LCOV - differential code coverage report
Current view:   top level - src/backend/storage/buffer - bufmgr.c (source / functions)
Current:        Differential Code Coverage HEAD vs 15
Current Date:   2023-04-08 15:15:32
Baseline:       15
Baseline Date:  2023-04-08 15:09:40

Coverage summary:
  Lines:     92.2 %  (Total 1503, Hit 1386; UNC 45, LBC 20, UIC 52, GBC 26, GIC 667, GNC 413, CBC 280, EUB 74, ECB 906, DUB 17, DCB 180)
  Functions: 97.5 %  (Total 80, Hit 78; remaining category counts as reported: 2 57 21 2 68 10)

Legend: Lines: hit / not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * bufmgr.c
       4                 :  *    buffer manager interface routines
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/storage/buffer/bufmgr.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : /*
      16                 :  * Principal entry points:
      17                 :  *
      18                 :  * ReadBuffer() -- find or create a buffer holding the requested page,
      19                 :  *      and pin it so that no one can destroy it while this process
      20                 :  *      is using it.
      21                 :  *
      22                 :  * ReleaseBuffer() -- unpin a buffer
      23                 :  *
      24                 :  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
      25                 :  *      The disk write is delayed until buffer replacement or checkpoint.
      26                 :  *
      27                 :  * See also these files:
      28                 :  *      freelist.c -- chooses victim for buffer replacement
      29                 :  *      buf_table.c -- manages the buffer lookup table
      30                 :  */
      31                 : #include "postgres.h"
      32                 : 
      33                 : #include <sys/file.h>
      34                 : #include <unistd.h>
      35                 : 
      36                 : #include "access/tableam.h"
      37                 : #include "access/xloginsert.h"
      38                 : #include "access/xlogutils.h"
      39                 : #include "catalog/catalog.h"
      40                 : #include "catalog/storage.h"
      41                 : #include "catalog/storage_xlog.h"
      42                 : #include "executor/instrument.h"
      43                 : #include "lib/binaryheap.h"
      44                 : #include "miscadmin.h"
      45                 : #include "pg_trace.h"
      46                 : #include "pgstat.h"
      47                 : #include "postmaster/bgwriter.h"
      48                 : #include "storage/buf_internals.h"
      49                 : #include "storage/bufmgr.h"
      50                 : #include "storage/ipc.h"
      51                 : #include "storage/lmgr.h"
      52                 : #include "storage/proc.h"
      53                 : #include "storage/smgr.h"
      54                 : #include "storage/standby.h"
      55                 : #include "utils/memdebug.h"
      56                 : #include "utils/ps_status.h"
      57                 : #include "utils/rel.h"
      58                 : #include "utils/resowner_private.h"
      59                 : #include "utils/timestamp.h"
      60                 : 
      61                 : 
      62                 : /* Note: these two macros only work on shared buffers, not local ones! */
      63                 : #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
      64                 : #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))
      65                 : 
      66                 : /* Note: this macro only works on local buffers, not shared ones! */
      67                 : #define LocalBufHdrGetBlock(bufHdr) \
      68                 :     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
      69                 : 
      70                 : /* Bits in SyncOneBuffer's return value */
      71                 : #define BUF_WRITTEN             0x01
      72                 : #define BUF_REUSABLE            0x02
      73                 : 
      74                 : #define RELS_BSEARCH_THRESHOLD      20
      75                 : 
      76                 : /*
       77                 :  * This is the size (in number of blocks) above which we scan the entire
       78                 :  * buffer pool to remove the buffers for all the pages of the relation
       79                 :  * being dropped.  For relations below this threshold, we find the
       80                 :  * buffers by doing lookups in the BufMapping table.
      81                 :  */
      82                 : #define BUF_DROP_FULL_SCAN_THRESHOLD        (uint64) (NBuffers / 32)
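
(For scale: with the default shared_buffers of 16384 buffers, which is 128 MB at the default 8 kB block size, NBuffers / 32 works out to 512 blocks, i.e. 4 MB; relations at or below that size have their buffers dropped via per-block BufMapping lookups rather than a full buffer-pool scan.)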
      83                 : 
      84                 : typedef struct PrivateRefCountEntry
      85                 : {
      86                 :     Buffer      buffer;
      87                 :     int32       refcount;
      88                 : } PrivateRefCountEntry;
      89                 : 
      90                 : /* 64 bytes, about the size of a cache line on common systems */
      91                 : #define REFCOUNT_ARRAY_ENTRIES 8
      92                 : 
      93                 : /*
      94                 :  * Status of buffers to checkpoint for a particular tablespace, used
      95                 :  * internally in BufferSync.
      96                 :  */
      97                 : typedef struct CkptTsStatus
      98                 : {
      99                 :     /* oid of the tablespace */
     100                 :     Oid         tsId;
     101                 : 
     102                 :     /*
     103                 :      * Checkpoint progress for this tablespace. To make progress comparable
     104                 :      * between tablespaces the progress is, for each tablespace, measured as a
     105                 :      * number between 0 and the total number of to-be-checkpointed pages. Each
     106                 :      * page checkpointed in this tablespace increments this space's progress
     107                 :      * by progress_slice.
     108                 :      */
     109                 :     float8      progress;
     110                 :     float8      progress_slice;
     111                 : 
     112                 :     /* number of to-be checkpointed pages in this tablespace */
     113                 :     int         num_to_scan;
     114                 :     /* already processed pages in this tablespace */
     115                 :     int         num_scanned;
     116                 : 
     117                 :     /* current offset in CkptBufferIds for this tablespace */
     118                 :     int         index;
     119                 : } CkptTsStatus;
     120                 : 
     121                 : /*
     122                 :  * Type for array used to sort SMgrRelations
     123                 :  *
     124                 :  * FlushRelationsAllBuffers shares the same comparator function with
     125                 :  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
     126                 :  * compatible.
     127                 :  */
     128                 : typedef struct SMgrSortArray
     129                 : {
     130                 :     RelFileLocator rlocator;    /* This must be the first member */
     131                 :     SMgrRelation srel;
     132                 : } SMgrSortArray;
     133                 : 
     134                 : /* GUC variables */
     135                 : bool        zero_damaged_pages = false;
     136                 : int         bgwriter_lru_maxpages = 100;
     137                 : double      bgwriter_lru_multiplier = 2.0;
     138                 : bool        track_io_timing = false;
     139                 : 
     140                 : /*
     141                 :  * How many buffers PrefetchBuffer callers should try to stay ahead of their
     142                 :  * ReadBuffer calls by.  Zero means "never prefetch".  This value is only used
     143                 :  * for buffers not belonging to tablespaces that have their
     144                 :  * effective_io_concurrency parameter set.
     145                 :  */
     146                 : int         effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
     147                 : 
     148                 : /*
     149                 :  * Like effective_io_concurrency, but used by maintenance code paths that might
     150                 :  * benefit from a higher setting because they work on behalf of many sessions.
     151                 :  * Overridden by the tablespace setting of the same name.
     152                 :  */
     153                 : int         maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
     154                 : 
     155                 : /*
     156                 :  * GUC variables about triggering kernel writeback for buffers written; OS
     157                 :  * dependent defaults are set via the GUC mechanism.
     158                 :  */
     159                 : int         checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
     160                 : int         bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
     161                 : int         backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
     162                 : 
     163                 : /* local state for LockBufferForCleanup */
     164                 : static BufferDesc *PinCountWaitBuf = NULL;
     165                 : 
     166                 : /*
     167                 :  * Backend-Private refcount management:
     168                 :  *
     169                 :  * Each buffer also has a private refcount that keeps track of the number of
     170                 :  * times the buffer is pinned in the current process.  This is so that the
     171                 :  * shared refcount needs to be modified only once if a buffer is pinned more
     172                 :  * than once by an individual backend.  It's also used to check that no buffers
     173                 :  * are still pinned at the end of transactions and when exiting.
     174                 :  *
     175                 :  *
     176                 :  * To avoid - as we used to - requiring an array with NBuffers entries to keep
     177                 :  * track of local buffers, we use a small sequentially searched array
     178                 :  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
     179                 :  * keep track of backend local pins.
     180                 :  *
     181                 :  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
     182                 :  * refcounts are kept track of in the array; after that, new array entries
     183                 :  * displace old ones into the hash table. That way a frequently used entry
     184                 :  * can't get "stuck" in the hashtable while infrequent ones clog the array.
     185                 :  *
     186                 :  * Note that in most scenarios the number of pinned buffers will not exceed
     187                 :  * REFCOUNT_ARRAY_ENTRIES.
     188                 :  *
     189                 :  *
     190                 :  * To enter a buffer into the refcount tracking mechanism first reserve a free
     191                 :  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
     192                 :  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
     193                 :  * memory allocations in NewPrivateRefCountEntry() which can be important
     194                 :  * because in some scenarios it's called with a spinlock held...
     195                 :  */
     196                 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
     197                 : static HTAB *PrivateRefCountHash = NULL;
     198                 : static int32 PrivateRefCountOverflowed = 0;
     199                 : static uint32 PrivateRefCountClock = 0;
     200                 : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
     201                 : 
     202                 : static void ReservePrivateRefCountEntry(void);
     203                 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
     204                 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
     205                 : static inline int32 GetPrivateRefCount(Buffer buffer);
     206                 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
     207                 : 
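
A minimal sketch of the reserve-then-fill protocol described above, written as a hypothetical bufmgr.c-internal helper (pin_buffer_sketch is not a real function; LockBufHdr, UnlockBufHdr and BufferDescriptorGetBuffer are the existing buf_internals.h primitives):

    static void
    pin_buffer_sketch(BufferDesc *buf)
    {
        PrivateRefCountEntry *ref;
        uint32      buf_state;

        /* Reserve first: this may search or allocate, so it must happen
         * before any spinlock is taken. */
        ReservePrivateRefCountEntry();

        buf_state = LockBufHdr(buf);
        /* ... adjust the shared refcount in buf_state here ... */
        UnlockBufHdr(buf, buf_state);

        /* Filling the reserved entry cannot fail and allocates nothing. */
        ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
        ref->refcount++;
    }
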
     208                 : /*
     209                 :  * Ensure that the PrivateRefCountArray has sufficient space to store one more
     210                 :  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
     211 ECB             :  * a new entry - but it's perfectly fine to not use a reserved entry.
     212                 :  */
     213                 : static void
     214 CBC    69663558 : ReservePrivateRefCountEntry(void)
     215 ECB             : {
     216                 :     /* Already reserved (or freed), nothing to do */
     217 GIC    69663558 :     if (ReservedRefCountEntry != NULL)
     218        66674250 :         return;
     219                 : 
     220                 :     /*
      221                 :      * First search for a free entry in the array, that'll be sufficient in the
     222                 :      * majority of cases.
     223                 :      */
     224 ECB             :     {
     225                 :         int         i;
     226                 : 
     227 GIC     6434020 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     228 ECB             :         {
     229                 :             PrivateRefCountEntry *res;
     230                 : 
     231 GIC     6387812 :             res = &PrivateRefCountArray[i];
     232 ECB             : 
     233 CBC     6387812 :             if (res->buffer == InvalidBuffer)
     234                 :             {
     235 GIC     2943100 :                 ReservedRefCountEntry = res;
     236         2943100 :                 return;
     237                 :             }
     238                 :         }
     239                 :     }
     240                 : 
     241                 :     /*
     242                 :      * No luck. All array entries are full. Move one array entry into the hash
     243                 :      * table.
     244                 :      */
     245                 :     {
     246                 :         /*
     247                 :          * Move entry from the current clock position in the array into the
     248                 :          * hashtable. Use that slot.
     249                 :          */
     250                 :         PrivateRefCountEntry *hashent;
     251 ECB             :         bool        found;
     252                 : 
     253                 :         /* select victim slot */
     254 GIC       46208 :         ReservedRefCountEntry =
     255 CBC       46208 :             &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
     256                 : 
     257                 :         /* Better be used, otherwise we shouldn't get here. */
     258           46208 :         Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
     259 ECB             : 
     260                 :         /* enter victim array entry into hashtable */
     261 GIC       46208 :         hashent = hash_search(PrivateRefCountHash,
     262 GNC       46208 :                               &(ReservedRefCountEntry->buffer),
     263 ECB             :                               HASH_ENTER,
     264                 :                               &found);
     265 GIC       46208 :         Assert(!found);
     266 CBC       46208 :         hashent->refcount = ReservedRefCountEntry->refcount;
     267 ECB             : 
     268                 :         /* clear the now free array slot */
     269 CBC       46208 :         ReservedRefCountEntry->buffer = InvalidBuffer;
     270 GIC       46208 :         ReservedRefCountEntry->refcount = 0;
     271                 : 
     272           46208 :         PrivateRefCountOverflowed++;
     273                 :     }
     274                 : }
     275                 : 
     276                 : /*
     277 ECB             :  * Fill a previously reserved refcount entry.
     278                 :  */
     279                 : static PrivateRefCountEntry *
     280 GIC    68472110 : NewPrivateRefCountEntry(Buffer buffer)
     281                 : {
     282 ECB             :     PrivateRefCountEntry *res;
     283                 : 
     284                 :     /* only allowed to be called when a reservation has been made */
     285 CBC    68472110 :     Assert(ReservedRefCountEntry != NULL);
     286 ECB             : 
     287                 :     /* use up the reserved entry */
     288 GIC    68472110 :     res = ReservedRefCountEntry;
     289 CBC    68472110 :     ReservedRefCountEntry = NULL;
     290 ECB             : 
     291                 :     /* and fill it */
     292 CBC    68472110 :     res->buffer = buffer;
     293 GIC    68472110 :     res->refcount = 0;
     294                 : 
     295        68472110 :     return res;
     296                 : }
     297                 : 
     298                 : /*
     299                 :  * Return the PrivateRefCount entry for the passed buffer.
     300                 :  *
     301                 :  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
     302                 :  * do_move is true, and the entry resides in the hashtable the entry is
     303 ECB             :  * optimized for frequent access by moving it to the array.
     304                 :  */
     305                 : static PrivateRefCountEntry *
     306 GIC   574705670 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
     307                 : {
     308 ECB             :     PrivateRefCountEntry *res;
     309                 :     int         i;
     310                 : 
     311 GIC   574705670 :     Assert(BufferIsValid(buffer));
     312       574705670 :     Assert(!BufferIsLocal(buffer));
     313                 : 
     314                 :     /*
     315 ECB             :      * First search for references in the array, that'll be sufficient in the
     316                 :      * majority of cases.
     317                 :      */
     318 GIC  1583180876 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     319 ECB             :     {
     320 CBC  1514073662 :         res = &PrivateRefCountArray[i];
     321                 : 
     322 GIC  1514073662 :         if (res->buffer == buffer)
     323       505598456 :             return res;
     324                 :     }
     325                 : 
     326                 :     /*
     327                 :      * By here we know that the buffer, if already pinned, isn't residing in
     328                 :      * the array.
     329                 :      *
     330 ECB             :      * Only look up the buffer in the hashtable if we've previously overflowed
     331                 :      * into it.
     332                 :      */
     333 CBC    69107214 :     if (PrivateRefCountOverflowed == 0)
     334 GIC    68758477 :         return NULL;
     335 ECB             : 
     336 GNC      348737 :     res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
     337 ECB             : 
     338 GIC      348737 :     if (res == NULL)
     339          137246 :         return NULL;
     340          211491 :     else if (!do_move)
     341                 :     {
     342                 :         /* caller doesn't want us to move the hash entry into the array */
     343          211254 :         return res;
     344                 :     }
     345                 :     else
     346 ECB             :     {
     347                 :         /* move buffer from hashtable into the free array slot */
     348                 :         bool        found;
     349                 :         PrivateRefCountEntry *free;
     350                 : 
     351                 :         /* Ensure there's a free array slot */
     352 CBC         237 :         ReservePrivateRefCountEntry();
     353                 : 
     354                 :         /* Use up the reserved slot */
     355             237 :         Assert(ReservedRefCountEntry != NULL);
     356             237 :         free = ReservedRefCountEntry;
     357 GIC         237 :         ReservedRefCountEntry = NULL;
     358             237 :         Assert(free->buffer == InvalidBuffer);
     359 ECB             : 
     360                 :         /* and fill it */
     361 CBC         237 :         free->buffer = buffer;
     362             237 :         free->refcount = res->refcount;
     363                 : 
     364 ECB             :         /* delete from hashtable */
     365 GNC         237 :         hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
     366 GIC         237 :         Assert(found);
     367             237 :         Assert(PrivateRefCountOverflowed > 0);
     368             237 :         PrivateRefCountOverflowed--;
     369                 : 
     370             237 :         return free;
     371 ECB             :     }
     372                 : }
     373                 : 
     374                 : /*
     375                 :  * Returns how many times the passed buffer is pinned by this backend.
     376                 :  *
     377                 :  * Only works for shared memory buffers!
     378                 :  */
     379                 : static inline int32
     380 GIC   411969948 : GetPrivateRefCount(Buffer buffer)
     381                 : {
     382 ECB             :     PrivateRefCountEntry *ref;
     383                 : 
     384 CBC   411969948 :     Assert(BufferIsValid(buffer));
     385       411969948 :     Assert(!BufferIsLocal(buffer));
     386 ECB             : 
     387                 :     /*
     388                 :      * Not moving the entry - that's ok for the current users, but we might
     389                 :      * want to change this one day.
     390                 :      */
     391 GIC   411969948 :     ref = GetPrivateRefCountEntry(buffer, false);
     392                 : 
     393       411969948 :     if (ref == NULL)
     394 CBC      423613 :         return 0;
     395 GIC   411546335 :     return ref->refcount;
     396 ECB             : }
     397                 : 
     398                 : /*
     399                 :  * Release resources used to track the reference count of a buffer which we no
     400                 :  * longer have pinned and don't want to pin again immediately.
     401                 :  */
     402                 : static void
     403 GIC    68472110 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
     404                 : {
     405        68472110 :     Assert(ref->refcount == 0);
     406                 : 
     407        68472110 :     if (ref >= &PrivateRefCountArray[0] &&
     408 ECB             :         ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
     409                 :     {
     410 GIC    68426139 :         ref->buffer = InvalidBuffer;
     411                 : 
     412                 :         /*
     413 ECB             :          * Mark the just used entry as reserved - in many scenarios that
     414                 :          * allows us to avoid ever having to search the array/hash for free
     415                 :          * entries.
     416                 :          */
     417 CBC    68426139 :         ReservedRefCountEntry = ref;
     418 ECB             :     }
     419                 :     else
     420                 :     {
     421                 :         bool        found;
     422 GIC       45971 :         Buffer      buffer = ref->buffer;
     423                 : 
     424 GNC       45971 :         hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
     425 GIC       45971 :         Assert(found);
     426           45971 :         Assert(PrivateRefCountOverflowed > 0);
     427           45971 :         PrivateRefCountOverflowed--;
     428                 :     }
     429        68472110 : }
     430                 : 
     431                 : /*
     432                 :  * BufferIsPinned
     433                 :  *      True iff the buffer is pinned (also checks for valid buffer number).
     434                 :  *
     435                 :  *      NOTE: what we check here is that *this* backend holds a pin on
     436                 :  *      the buffer.  We do not care whether some other backend does.
     437                 :  */
     438                 : #define BufferIsPinned(bufnum) \
     439                 : ( \
     440                 :     !BufferIsValid(bufnum) ? \
     441                 :         false \
     442                 :     : \
     443                 :         BufferIsLocal(bufnum) ? \
     444                 :             (LocalRefCount[-(bufnum) - 1] > 0) \
     445                 :         : \
     446                 :     (GetPrivateRefCount(bufnum) > 0) \
     447                 : )
     448                 : 
     449                 : 
     450                 : static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
     451                 :                                 ForkNumber forkNum, BlockNumber blockNum,
     452                 :                                 ReadBufferMode mode, BufferAccessStrategy strategy,
     453                 :                                 bool *hit);
     454                 : static BlockNumber ExtendBufferedRelCommon(ExtendBufferedWhat eb,
     455                 :                                            ForkNumber fork,
     456                 :                                            BufferAccessStrategy strategy,
     457                 :                                            uint32 flags,
     458                 :                                            uint32 extend_by,
     459                 :                                            BlockNumber extend_upto,
     460                 :                                            Buffer *buffers,
     461                 :                                            uint32 *extended_by);
     462                 : static BlockNumber ExtendBufferedRelShared(ExtendBufferedWhat eb,
     463                 :                                            ForkNumber fork,
     464                 :                                            BufferAccessStrategy strategy,
     465                 :                                            uint32 flags,
     466                 :                                            uint32 extend_by,
     467                 :                                            BlockNumber extend_upto,
     468                 :                                            Buffer *buffers,
     469                 :                                            uint32 *extended_by);
     470                 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
     471                 : static void PinBuffer_Locked(BufferDesc *buf);
     472                 : static void UnpinBuffer(BufferDesc *buf);
     473                 : static void BufferSync(int flags);
     474                 : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
     475                 : static int  SyncOneBuffer(int buf_id, bool skip_recently_used,
     476                 :                           WritebackContext *wb_context);
     477                 : static void WaitIO(BufferDesc *buf);
     478                 : static bool StartBufferIO(BufferDesc *buf, bool forInput);
     479                 : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
     480                 :                               uint32 set_flag_bits);
     481                 : static void shared_buffer_write_error_callback(void *arg);
     482                 : static void local_buffer_write_error_callback(void *arg);
     483                 : static BufferDesc *BufferAlloc(SMgrRelation smgr,
     484                 :                                char relpersistence,
     485                 :                                ForkNumber forkNum,
     486                 :                                BlockNumber blockNum,
     487                 :                                BufferAccessStrategy strategy,
     488                 :                                bool *foundPtr, IOContext io_context);
     489                 : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
     490                 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
     491                 :                         IOObject io_object, IOContext io_context);
     492                 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
     493                 :                                        ForkNumber forkNum,
     494                 :                                        BlockNumber nForkBlock,
     495                 :                                        BlockNumber firstDelBlock);
     496                 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
     497                 :                                            RelFileLocator dstlocator,
     498                 :                                            ForkNumber forkNum, bool permanent);
     499                 : static void AtProcExit_Buffers(int code, Datum arg);
     500                 : static void CheckForBufferLeaks(void);
     501                 : static int  rlocator_comparator(const void *p1, const void *p2);
     502                 : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
     503                 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
     504                 : static int  ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
     505                 : 
     506                 : 
     507                 : /*
     508                 :  * Implementation of PrefetchBuffer() for shared buffers.
     509                 :  */
     510                 : PrefetchBufferResult
     511          878497 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
     512                 :                      ForkNumber forkNum,
     513                 :                      BlockNumber blockNum)
     514                 : {
     515          878497 :     PrefetchBufferResult result = {InvalidBuffer, false};
     516                 :     BufferTag   newTag;         /* identity of requested block */
     517 ECB             :     uint32      newHash;        /* hash value for newTag */
     518                 :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
     519                 :     int         buf_id;
     520                 : 
     521 CBC      878497 :     Assert(BlockNumberIsValid(blockNum));
     522                 : 
     523                 :     /* create a tag so we can lookup the buffer */
     524 GNC      878497 :     InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
     525                 :                   forkNum, blockNum);
     526                 : 
     527 ECB             :     /* determine its hash code and partition lock ID */
     528 GIC      878497 :     newHash = BufTableHashCode(&newTag);
     529          878497 :     newPartitionLock = BufMappingPartitionLock(newHash);
     530 ECB             : 
     531                 :     /* see if the block is in the buffer pool already */
     532 GIC      878497 :     LWLockAcquire(newPartitionLock, LW_SHARED);
     533          878497 :     buf_id = BufTableLookup(&newTag, newHash);
     534 CBC      878497 :     LWLockRelease(newPartitionLock);
     535 ECB             : 
     536                 :     /* If not in buffers, initiate prefetch */
     537 GIC      878497 :     if (buf_id < 0)
     538 ECB             :     {
     539                 : #ifdef USE_PREFETCH
     540                 :         /*
     541                 :          * Try to initiate an asynchronous read.  This returns false in
     542                 :          * recovery if the relation file doesn't exist.
     543                 :          */
     544 GNC      424364 :         if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
     545          212103 :             smgrprefetch(smgr_reln, forkNum, blockNum))
     546                 :         {
     547 GIC      212103 :             result.initiated_io = true;
     548                 :         }
     549                 : #endif                          /* USE_PREFETCH */
     550                 :     }
     551                 :     else
     552                 :     {
     553 ECB             :         /*
     554                 :          * Report the buffer it was in at that time.  The caller may be able
     555                 :          * to avoid a buffer table lookup, but it's not pinned and it must be
     556                 :          * rechecked!
     557                 :          */
     558 GIC      666236 :         result.recent_buffer = buf_id + 1;
     559                 :     }
     560                 : 
     561                 :     /*
     562                 :      * If the block *is* in buffers, we do nothing.  This is not really ideal:
     563                 :      * the block might be just about to be evicted, which would be stupid
     564                 :      * since we know we are going to need it soon.  But the only easy answer
     565                 :      * is to bump the usage_count, which does not seem like a great solution:
     566                 :      * when the caller does ultimately touch the block, usage_count would get
     567 ECB             :      * bumped again, resulting in too much favoritism for blocks that are
     568                 :      * involved in a prefetch sequence. A real fix would involve some
     569                 :      * additional per-buffer state, and it's not clear that there's enough of
     570                 :      * a problem to justify that.
     571                 :      */
     572                 : 
     573 GIC      878497 :     return result;
     574                 : }
     575                 : 
     576                 : /*
     577                 :  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
     578                 :  *
     579                 :  * This is named by analogy to ReadBuffer but doesn't actually allocate a
     580                 :  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
     581                 :  * block will not be delayed by the I/O.  Prefetching is optional.
     582 ECB             :  *
     583                 :  * There are three possible outcomes:
     584                 :  *
     585                 :  * 1.  If the block is already cached, the result includes a valid buffer that
     586                 :  * could be used by the caller to avoid the need for a later buffer lookup, but
     587                 :  * it's not pinned, so the caller must recheck it.
     588                 :  *
     589                 :  * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
     590                 :  * true.  Currently there is no way to know if the data was already cached by
     591                 :  * the kernel and therefore didn't really initiate I/O, and no way to know when
     592                 :  * the I/O completes other than using synchronous ReadBuffer().
     593                 :  *
     594                 :  * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
     595                 :  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
     596                 :  * lack of a kernel facility), direct I/O is enabled, or the underlying
     597                 :  * relation file wasn't found and we are in recovery.  (If the relation file
     598                 :  * wasn't found and we are not in recovery, an error is raised).
     599                 :  */
     600                 : PrefetchBufferResult
     601 GIC      420321 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
     602                 : {
     603          420321 :     Assert(RelationIsValid(reln));
     604          420321 :     Assert(BlockNumberIsValid(blockNum));
     605                 : 
     606          420321 :     if (RelationUsesLocalBuffers(reln))
     607                 :     {
     608                 :         /* see comments in ReadBufferExtended */
     609            6244 :         if (RELATION_IS_OTHER_TEMP(reln))
     610 LBC           0 :             ereport(ERROR,
     611                 :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     612 ECB             :                      errmsg("cannot access temporary tables of other sessions")));
     613                 : 
     614                 :         /* pass it off to localbuf.c */
     615 CBC        6244 :         return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     616                 :     }
     617                 :     else
     618 ECB             :     {
     619 EUB             :         /* pass it to the shared buffer version */
     620 GIC      414077 :         return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     621                 :     }
     622                 : }
     623                 : 
     624 ECB             : /*
     625                 :  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
     626                 :  *
     627                 :  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
     628                 :  * successful.  Return true if the buffer is valid and still has the expected
     629                 :  * tag.  In that case, the buffer is pinned and the usage count is bumped.
     630                 :  */
     631                 : bool
     632 GNC      423615 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
     633                 :                  Buffer recent_buffer)
     634                 : {
     635                 :     BufferDesc *bufHdr;
     636                 :     BufferTag   tag;
     637                 :     uint32      buf_state;
     638                 :     bool        have_private_ref;
     639                 : 
     640 GIC      423615 :     Assert(BufferIsValid(recent_buffer));
     641 ECB             : 
     642 GIC      423615 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
     643          423615 :     ReservePrivateRefCountEntry();
     644 GNC      423615 :     InitBufferTag(&tag, &rlocator, forkNum, blockNum);
     645                 : 
     646 GIC      423615 :     if (BufferIsLocal(recent_buffer))
     647                 :     {
     648 UIC           0 :         int         b = -recent_buffer - 1;
     649 ECB             : 
     650 UIC           0 :         bufHdr = GetLocalBufferDescriptor(b);
     651 LBC           0 :         buf_state = pg_atomic_read_u32(&bufHdr->state);
     652 ECB             : 
     653                 :         /* Is it still valid and holding the right tag? */
     654 UNC           0 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
     655 ECB             :         {
     656 UNC           0 :             PinLocalBuffer(bufHdr, true);
     657                 : 
     658 UIC           0 :             pgBufferUsage.local_blks_hit++;
     659                 : 
     660               0 :             return true;
     661 ECB             :         }
     662                 :     }
     663                 :     else
     664                 :     {
     665 GIC      423615 :         bufHdr = GetBufferDescriptor(recent_buffer - 1);
     666          423615 :         have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
     667                 : 
     668                 :         /*
     669 ECB             :          * Do we already have this buffer pinned with a private reference?  If
     670                 :          * so, it must be valid and it is safe to check the tag without
     671                 :          * locking.  If not, we have to lock the header first and then check.
     672                 :          */
     673 GIC      423615 :         if (have_private_ref)
     674 CBC           4 :             buf_state = pg_atomic_read_u32(&bufHdr->state);
     675                 :         else
     676 GIC      423611 :             buf_state = LockBufHdr(bufHdr);
     677                 : 
     678 GNC      423615 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
     679                 :         {
     680                 :             /*
     681 ECB             :              * It's now safe to pin the buffer.  We can't pin first and ask
     682 EUB             :              * questions later, because it might confuse code paths like
     683                 :              * InvalidateBuffer() if we pinned a random non-matching buffer.
     684 ECB             :              */
     685 GIC      422006 :             if (have_private_ref)
     686 LBC           0 :                 PinBuffer(bufHdr, NULL);    /* bump pin count */
     687                 :             else
     688 CBC      422006 :                 PinBuffer_Locked(bufHdr);   /* pin for first time */
     689                 : 
     690 GIC      422006 :             pgBufferUsage.shared_blks_hit++;
     691                 : 
     692 CBC      422006 :             return true;
     693 ECB             :         }
     694                 : 
     695                 :         /* If we locked the header above, now unlock. */
     696 CBC        1609 :         if (!have_private_ref)
     697 GIC        1605 :             UnlockBufHdr(bufHdr, buf_state);
     698                 :     }
     699                 : 
     700            1609 :     return false;
     701                 : }
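
A usage sketch of the prefetch hint flow described above (read_block_with_prefetch_hint is hypothetical; pf would come from an earlier PrefetchBuffer(rel, MAIN_FORKNUM, blkno) call, and rd_locator is the relation's RelFileLocator):

    static Buffer
    read_block_with_prefetch_hint(Relation rel, BlockNumber blkno,
                                  PrefetchBufferResult pf)
    {
        /* If the prefetch saw the block in shared buffers, try that buffer
         * first; on success it is pinned and the mapping lookup is skipped. */
        if (BufferIsValid(pf.recent_buffer) &&
            ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
                             pf.recent_buffer))
            return pf.recent_buffer;

        /* Hint was stale or absent: take the normal path. */
        return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
    }
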
     702                 : 
     703                 : /*
     704 ECB             :  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
     705                 :  *      fork with RBM_NORMAL mode and default strategy.
     706                 :  */
     707                 : Buffer
     708 GIC    50153010 : ReadBuffer(Relation reln, BlockNumber blockNum)
     709                 : {
     710        50153010 :     return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
     711                 : }
     712                 : 
     713                 : /*
     714                 :  * ReadBufferExtended -- returns a buffer containing the requested
     715                 :  *      block of the requested relation.  If the blknum
     716                 :  *      requested is P_NEW, extend the relation file and
     717                 :  *      allocate a new block.  (Caller is responsible for
     718                 :  *      ensuring that only one backend tries to extend a
     719                 :  *      relation at the same time!)
     720                 :  *
     721                 :  * Returns: the buffer number for the buffer containing
     722                 :  *      the block read.  The returned buffer has been pinned.
     723                 :  *      Does not return on error --- elog's instead.
     724                 :  *
     725                 :  * Assume when this function is called, that reln has been opened already.
     726                 :  *
     727                 :  * In RBM_NORMAL mode, the page is read from disk, and the page header is
     728                 :  * validated.  An error is thrown if the page header is not valid.  (But
     729                 :  * note that an all-zero page is considered "valid"; see
     730                 :  * PageIsVerifiedExtended().)
     731                 :  *
     732                 :  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
     733                 :  * valid, the page is zeroed instead of throwing an error. This is intended
     734                 :  * for non-critical data, where the caller is prepared to repair errors.
     735                 :  *
     736                 :  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
     737                 :  * filled with zeros instead of reading it from disk.  Useful when the caller
     738                 :  * is going to fill the page from scratch, since this saves I/O and avoids
     739                 :  * unnecessary failure if the page-on-disk has corrupt page headers.
     740                 :  * The page is returned locked to ensure that the caller has a chance to
     741                 :  * initialize the page before it's made visible to others.
     742                 :  * Caution: do not use this mode to read a page that is beyond the relation's
     743                 :  * current physical EOF; that is likely to cause problems in md.c when
     744                 :  * the page is modified and written out. P_NEW is OK, though.
     745                 :  *
     746                 :  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
     747                 :  * a cleanup-strength lock on the page.
     748                 :  *
     749                 :  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
     750                 :  *
     751 ECB             :  * If strategy is not NULL, a nondefault buffer access strategy is used.
     752                 :  * See buffer/README for details.
     753                 :  */
     754                 : Buffer
     755 GIC    67680696 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
     756                 :                    ReadBufferMode mode, BufferAccessStrategy strategy)
     757                 : {
     758                 :     bool        hit;
     759                 :     Buffer      buf;
     760                 : 
     761                 :     /*
     762 ECB             :      * Reject attempts to read non-local temporary relations; we would be
     763 EUB             :      * likely to get wrong data since we have no visibility into the owning
     764                 :      * session's local buffers.
     765                 :      */
     766 GIC    67680696 :     if (RELATION_IS_OTHER_TEMP(reln))
     767 UIC           0 :         ereport(ERROR,
     768                 :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     769                 :                  errmsg("cannot access temporary tables of other sessions")));
     770                 : 
     771 ECB             :     /*
     772                 :      * Read the buffer, and update pgstat counters to reflect a cache hit or
     773                 :      * miss.
     774                 :      */
     775 CBC    67680696 :     pgstat_count_buffer_read(reln);
     776        67680696 :     buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
     777                 :                             forkNum, blockNum, mode, strategy, &hit);
     778 GIC    67680683 :     if (hit)
     779        66529542 :         pgstat_count_buffer_hit(reln);
     780        67680683 :     return buf;
     781                 : }
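
The read modes described above, shown as three independent call patterns (illustrative lines only; each returned buffer is pinned and must eventually be released, and the locked variants must also be unlocked):

    Buffer      buf;

    /* Normal read: the page header is validated; corruption raises an error. */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);

    /* Non-critical data: a page with a bad header comes back zeroed instead. */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);

    /* Caller will rewrite the whole page: zero it instead of reading it from
     * disk (unless already cached) and return it locked. */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL);
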
     782                 : 
     783                 : 
     784                 : /*
     785                 :  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
     786                 :  *      a relcache entry for the relation.
     787                 :  *
     788                 :  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
     789                 :  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
     790                 :  * cannot be used for temporary relations (and making that work might be
     791 ECB             :  * difficult, unless we only want to read temporary relations for our own
     792                 :  * BackendId).
     793                 :  */
     794                 : Buffer
     795 GNC     2815054 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
     796                 :                           BlockNumber blockNum, ReadBufferMode mode,
     797 ECB             :                           BufferAccessStrategy strategy, bool permanent)
     798                 : {
     799                 :     bool        hit;
     800                 : 
     801 GNC     2815054 :     SMgrRelation smgr = smgropen(rlocator, InvalidBackendId);
     802                 : 
     803 GIC     2815054 :     return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
     804                 :                              RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
     805                 :                              mode, strategy, &hit);
     806                 : }
     807                 : 
     808                 : /*
     809                 :  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
     810                 :  */
     811                 : Buffer
     812 GNC       62012 : ExtendBufferedRel(ExtendBufferedWhat eb,
     813                 :                   ForkNumber forkNum,
     814                 :                   BufferAccessStrategy strategy,
     815                 :                   uint32 flags)
     816                 : {
     817                 :     Buffer      buf;
     818           62012 :     uint32      extend_by = 1;
     819                 : 
     820           62012 :     ExtendBufferedRelBy(eb, forkNum, strategy, flags, extend_by,
     821                 :                         &buf, &extend_by);
     822                 : 
     823           62012 :     return buf;
     824                 : }
     825                 : 
     826                 : /*
     827                 :  * Extend relation by multiple blocks.
     828                 :  *
     829                 :  * Tries to extend the relation by extend_by blocks. Depending on the
     830                 :  * availability of resources the relation may end up being extended by a
     831                 :  * smaller number of pages (unless an error is thrown, always by at least one
     832                 :  * page). *extended_by is updated to the number of pages the relation has been
     833                 :  * extended to.
     834                 :  *
     835                 :  * buffers needs to be an array that is at least extend_by long. Upon
     836                 :  * completion, the first extend_by array elements will point to a pinned
     837                 :  * buffer.
     838                 :  *
     839                 :  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
     840                 :  * locked. This is useful for callers that want a buffer that is guaranteed to
     841                 :  * be empty.
     842                 :  */
     843                 : BlockNumber
     844          277364 : ExtendBufferedRelBy(ExtendBufferedWhat eb,
     845                 :                     ForkNumber fork,
     846                 :                     BufferAccessStrategy strategy,
     847                 :                     uint32 flags,
     848                 :                     uint32 extend_by,
     849                 :                     Buffer *buffers,
     850                 :                     uint32 *extended_by)
     851                 : {
     852          277364 :     Assert((eb.rel != NULL) != (eb.smgr != NULL));
     853          277364 :     Assert(eb.smgr == NULL || eb.relpersistence != 0);
     854          277364 :     Assert(extend_by > 0);
     855                 : 
     856          277364 :     if (eb.smgr == NULL)
     857                 :     {
     858          277177 :         eb.smgr = RelationGetSmgr(eb.rel);
     859          277177 :         eb.relpersistence = eb.rel->rd_rel->relpersistence;
     860                 :     }
     861                 : 
     862          277364 :     return ExtendBufferedRelCommon(eb, fork, strategy, flags,
     863                 :                                    extend_by, InvalidBlockNumber,
     864                 :                                    buffers, extended_by);
     865                 : }
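
A minimal sketch of bulk extension as described above (hypothetical caller code; assumes the EB_REL() initializer for ExtendBufferedWhat from this snapshot's bufmgr.h):

    Buffer      buffers[16];
    uint32      extended_by = 0;
    BlockNumber first_block;

    /* Ask for up to 16 new blocks; fewer may be returned, but at least one. */
    first_block = ExtendBufferedRelBy(EB_REL(rel), MAIN_FORKNUM,
                                      NULL, EB_LOCK_FIRST,
                                      lengthof(buffers),
                                      buffers, &extended_by);

    for (uint32 i = 0; i < extended_by; i++)
    {
        /* ... initialize page first_block + i in buffers[i] ... */
        if (i == 0)
            UnlockReleaseBuffer(buffers[i]);    /* EB_LOCK_FIRST locked it */
        else
            ReleaseBuffer(buffers[i]);
    }
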
     866                 : 
     867                 : /*
     868                 :  * Extend the relation so it is at least extend_to blocks large, return buffer
     869                 :  * (extend_to - 1).
     870                 :  *
     871                 :  * This is useful for callers that want to write a specific page, regardless
     872                 :  * of the current size of the relation (e.g. useful for visibilitymap and for
     873                 :  * crash recovery).
     874                 :  */
     875                 : Buffer
     876           64243 : ExtendBufferedRelTo(ExtendBufferedWhat eb,
     877                 :                     ForkNumber fork,
     878                 :                     BufferAccessStrategy strategy,
     879                 :                     uint32 flags,
     880                 :                     BlockNumber extend_to,
     881                 :                     ReadBufferMode mode)
     882                 : {
     883                 :     BlockNumber current_size;
     884           64243 :     uint32      extended_by = 0;
     885           64243 :     Buffer      buffer = InvalidBuffer;
     886                 :     Buffer      buffers[64];
     887                 : 
     888           64243 :     Assert((eb.rel != NULL) != (eb.smgr != NULL));
     889           64243 :     Assert(eb.smgr == NULL || eb.relpersistence != 0);
     890           64243 :     Assert(extend_to != InvalidBlockNumber && extend_to > 0);
     891           64243 :     Assert(mode == RBM_NORMAL || mode == RBM_ZERO_ON_ERROR ||
     892                 :            mode == RBM_ZERO_AND_LOCK);
     893                 : 
     894           64243 :     if (eb.smgr == NULL)
     895                 :     {
     896           27870 :         eb.smgr = RelationGetSmgr(eb.rel);
     897           27870 :         eb.relpersistence = eb.rel->rd_rel->relpersistence;
     898                 :     }
     899                 : 
     900                 :     /*
     901                 :      * If desired, create the file if it doesn't exist.  If
     902                 :      * smgr_cached_nblocks[fork] is positive then it must exist, no need for
     903                 :      * an smgrexists call.
     904                 :      */
     905           64243 :     if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
     906           27870 :         (eb.smgr->smgr_cached_nblocks[fork] == 0 ||
     907              10 :          eb.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
     908           27860 :         !smgrexists(eb.smgr, fork))
     909                 :     {
     910           27855 :         LockRelationForExtension(eb.rel, ExclusiveLock);
     911                 : 
     912                 :         /* could have been closed while waiting for lock */
     913           27855 :         if (eb.rel)
     914           27855 :             eb.smgr = RelationGetSmgr(eb.rel);
     915                 : 
     916                 :         /* recheck, fork might have been created concurrently */
     917           27855 :         if (!smgrexists(eb.smgr, fork))
     918           27854 :             smgrcreate(eb.smgr, fork, flags & EB_PERFORMING_RECOVERY);
     919                 : 
     920           27855 :         UnlockRelationForExtension(eb.rel, ExclusiveLock);
     921                 :     }
     922                 : 
     923                 :     /*
     924                 :      * If requested, invalidate size cache, so that smgrnblocks asks the
     925                 :      * kernel.
     926                 :      */
     927           64243 :     if (flags & EB_CLEAR_SIZE_CACHE)
     928           27870 :         eb.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
     929                 : 
     930                 :     /*
     931                 :      * Estimate how many pages we'll need to extend by. This avoids acquiring
     932                 :      * unnecessarily many victim buffers.
     933                 :      */
     934           64243 :     current_size = smgrnblocks(eb.smgr, fork);
     935                 : 
     936           64243 :     if (mode == RBM_ZERO_AND_LOCK)
     937           36056 :         flags |= EB_LOCK_TARGET;
     938                 : 
     939          130664 :     while (current_size < extend_to)
     940                 :     {
     941           66421 :         uint32      num_pages = lengthof(buffers);
     942                 :         BlockNumber first_block;
     943                 : 
     944           66421 :         if ((uint64) current_size + num_pages > extend_to)
     945           66355 :             num_pages = extend_to - current_size;
     946                 : 
     947           66421 :         first_block = ExtendBufferedRelCommon(eb, fork, strategy, flags,
     948                 :                                               num_pages, extend_to,
     949                 :                                               buffers, &extended_by);
     950                 : 
     951           66421 :         current_size = first_block + extended_by;
     952           66421 :         Assert(current_size <= extend_to);
     953           66421 :         Assert(num_pages != 0 || current_size >= extend_to);
     954                 : 
     955          160982 :         for (int i = 0; i < extended_by; i++)
     956                 :         {
     957           94561 :             if (first_block + i != extend_to - 1)
     958           30319 :                 ReleaseBuffer(buffers[i]);
     959                 :             else
     960           64242 :                 buffer = buffers[i];
     961                 :         }
     962                 :     }
     963                 : 
     964                 :     /*
     965                 :      * It's possible that another backend concurrently extended the relation.
     966                 :      * In that case read the buffer.
     967                 :      *
     968                 :      * XXX: Should we control this via a flag?
     969                 :      */
     970           64243 :     if (buffer == InvalidBuffer)
     971                 :     {
     972                 :         bool        hit;
     973                 : 
     974               1 :         Assert(extended_by == 0);
     975               1 :         buffer = ReadBuffer_common(eb.smgr, eb.relpersistence,
     976                 :                                    fork, extend_to - 1, mode, strategy,
     977                 :                                    &hit);
     978                 :     }
     979                 : 
     980           64243 :     return buffer;
     981                 : }
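
The header comment above names the visibility map and crash recovery as the intended users of ExtendBufferedRelTo(). The fragment below is a minimal sketch of that pattern from a hypothetical caller that needs a particular block to exist; EB_REL(), VISIBILITYMAP_FORKNUM, and the variable names are assumptions based on the surrounding API, not something this report asserts.

    /* Illustrative sketch of a hypothetical caller, not part of bufmgr.c */
    Buffer      buf;

    /* make sure blocks 0 .. blkno exist, creating the fork if necessary */
    buf = ExtendBufferedRelTo(EB_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
                              EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                              blkno + 1, RBM_ZERO_ON_ERROR);

    /* buf is pinned and holds block blkno; lock and initialize it as needed */
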
     982 ECB             : 
     983                 : /*
     984                 :  * ReadBuffer_common -- common logic for all ReadBuffer variants
     985                 :  *
     986                 :  * *hit is set to true if the request was satisfied from shared buffer cache.
     987                 :  */
     988                 : static Buffer
     989 GIC    70495751 : ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     990 ECB             :                   BlockNumber blockNum, ReadBufferMode mode,
     991                 :                   BufferAccessStrategy strategy, bool *hit)
     992                 : {
     993                 :     BufferDesc *bufHdr;
     994                 :     Block       bufBlock;
     995                 :     bool        found;
     996                 :     IOContext   io_context;
     997                 :     IOObject    io_object;
     998 GIC    70495751 :     bool        isLocalBuf = SmgrIsTemp(smgr);
     999                 : 
    1000        70495751 :     *hit = false;
    1001                 : 
    1002                 :     /*
     1003                 :      * Backward compatibility path; most code should use ExtendBufferedRel()
    1004                 :      * instead, as acquiring the extension lock inside ExtendBufferedRel()
    1005                 :      * scales a lot better.
    1006                 :      */
    1007 GNC    70495751 :     if (unlikely(blockNum == P_NEW))
    1008                 :     {
    1009             187 :         uint32      flags = EB_SKIP_EXTENSION_LOCK;
    1010                 : 
    1011             187 :         Assert(mode == RBM_NORMAL ||
    1012                 :                mode == RBM_ZERO_AND_LOCK ||
    1013                 :                mode == RBM_ZERO_ON_ERROR);
    1014                 : 
    1015             187 :         if (mode == RBM_ZERO_AND_LOCK)
    1016 UNC           0 :             flags |= EB_LOCK_FIRST;
    1017                 : 
    1018 GNC         187 :         return ExtendBufferedRel(EB_SMGR(smgr, relpersistence),
    1019                 :                                  forkNum, strategy, flags);
    1020                 :     }
    1021                 : 
    1022                 :     /* Make sure we will have room to remember the buffer pin */
    1023 GIC    70495564 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    1024                 : 
    1025                 :     TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
    1026                 :                                        smgr->smgr_rlocator.locator.spcOid,
    1027                 :                                        smgr->smgr_rlocator.locator.dbOid,
    1028                 :                                        smgr->smgr_rlocator.locator.relNumber,
    1029                 :                                        smgr->smgr_rlocator.backend);
    1030                 : 
    1031 CBC    70495564 :     if (isLocalBuf)
    1032                 :     {
    1033                 :         /*
    1034                 :          * We do not use a BufferAccessStrategy for I/O of temporary tables.
    1035                 :          * However, in some cases, the "strategy" may not be NULL, so we can't
    1036                 :          * rely on IOContextForStrategy() to set the right IOContext for us.
    1037                 :          * This may happen in cases like CREATE TEMPORARY TABLE AS...
    1038                 :          */
    1039 GNC     1247393 :         io_context = IOCONTEXT_NORMAL;
    1040         1247393 :         io_object = IOOBJECT_TEMP_RELATION;
    1041 CBC     1247393 :         bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
    1042         1247393 :         if (found)
    1043 GIC     1243603 :             pgBufferUsage.local_blks_hit++;
    1044            3790 :         else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
    1045                 :                  mode == RBM_ZERO_ON_ERROR)
    1046            3790 :             pgBufferUsage.local_blks_read++;
    1047                 :     }
    1048                 :     else
    1049                 :     {
    1050                 :         /*
     1051                 :          * look up the buffer.  IO_IN_PROGRESS is set if the requested block is
    1052                 :          * not currently in memory.
    1053                 :          */
    1054 GNC    69248171 :         io_context = IOContextForStrategy(strategy);
    1055        69248171 :         io_object = IOOBJECT_RELATION;
    1056 GIC    69248171 :         bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
    1057                 :                              strategy, &found, io_context);
    1058        69248171 :         if (found)
    1059 CBC    67748583 :             pgBufferUsage.shared_blks_hit++;
    1060 GIC     1499588 :         else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
    1061                 :                  mode == RBM_ZERO_ON_ERROR)
    1062         1306022 :             pgBufferUsage.shared_blks_read++;
    1063                 :     }
    1064                 : 
    1065 ECB             :     /* At this point we do NOT hold any locks. */
    1066                 : 
    1067                 :     /* if it was already in the buffer pool, we're done */
    1068 GIC    70495564 :     if (found)
    1069 ECB             :     {
    1070                 :         /* Just need to update stats before we exit */
    1071 GNC    68992186 :         *hit = true;
    1072        68992186 :         VacuumPageHit++;
    1073        68992186 :         pgstat_count_io_op(io_object, io_context, IOOP_HIT);
    1074 ECB             : 
    1075 GNC    68992186 :         if (VacuumCostActive)
    1076           53710 :             VacuumCostBalance += VacuumCostPageHit;
    1077 ECB             : 
    1078                 :         TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
    1079                 :                                           smgr->smgr_rlocator.locator.spcOid,
    1080                 :                                           smgr->smgr_rlocator.locator.dbOid,
    1081                 :                                           smgr->smgr_rlocator.locator.relNumber,
    1082                 :                                           smgr->smgr_rlocator.backend,
    1083                 :                                           found);
    1084                 : 
    1085                 :         /*
    1086                 :          * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
    1087                 :          * on return.
    1088                 :          */
    1089 GNC    68992186 :         if (!isLocalBuf)
    1090                 :         {
    1091        67748583 :             if (mode == RBM_ZERO_AND_LOCK)
    1092           30153 :                 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
    1093                 :                               LW_EXCLUSIVE);
    1094        67718430 :             else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1095              19 :                 LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
    1096                 :         }
    1097                 : 
    1098        68992186 :         return BufferDescriptorGetBuffer(bufHdr);
    1099                 :     }
    1100 ECB             : 
    1101                 :     /*
    1102                 :      * if we have gotten to this point, we have allocated a buffer for the
    1103                 :      * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
    1104                 :      * if it's a shared buffer.
    1105                 :      */
    1106 GIC     1503378 :     Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));    /* spinlock not needed */
    1107                 : 
    1108 CBC     1503378 :     bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
    1109                 : 
    1110                 :     /*
    1111                 :      * Read in the page, unless the caller intends to overwrite it and just
    1112                 :      * wants us to allocate a buffer.
    1113                 :      */
    1114 GNC     1503378 :     if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1115 GIC      193566 :         MemSet((char *) bufBlock, 0, BLCKSZ);
    1116                 :     else
    1117                 :     {
    1118 GNC     1309812 :         instr_time  io_start = pgstat_prepare_io_time();
    1119                 : 
    1120         1309812 :         smgrread(smgr, forkNum, blockNum, bufBlock);
    1121                 : 
    1122         1309799 :         pgstat_count_io_op_time(io_object, io_context,
    1123                 :                                 IOOP_READ, io_start, 1);
    1124                 : 
    1125                 :         /* check for garbage data */
    1126         1309799 :         if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
    1127                 :                                     PIV_LOG_WARNING | PIV_REPORT_STAT))
    1128                 :         {
    1129 UNC           0 :             if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
    1130                 :             {
    1131               0 :                 ereport(WARNING,
    1132                 :                         (errcode(ERRCODE_DATA_CORRUPTED),
    1133                 :                          errmsg("invalid page in block %u of relation %s; zeroing out page",
    1134                 :                                 blockNum,
    1135                 :                                 relpath(smgr->smgr_rlocator, forkNum))));
    1136               0 :                 MemSet((char *) bufBlock, 0, BLCKSZ);
    1137                 :             }
    1138                 :             else
    1139               0 :                 ereport(ERROR,
    1140                 :                         (errcode(ERRCODE_DATA_CORRUPTED),
    1141                 :                          errmsg("invalid page in block %u of relation %s",
    1142                 :                                 blockNum,
    1143                 :                                 relpath(smgr->smgr_rlocator, forkNum))));
    1144                 :         }
    1145                 :     }
    1146                 : 
    1147 ECB             :     /*
    1148                 :      * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
    1149                 :      * the page as valid, to make sure that no other backend sees the zeroed
    1150                 :      * page before the caller has had a chance to initialize it.
    1151                 :      *
    1152                 :      * Since no-one else can be looking at the page contents yet, there is no
    1153                 :      * difference between an exclusive lock and a cleanup-strength lock. (Note
    1154                 :      * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
    1155                 :      * they assert that the buffer is already valid.)
    1156                 :      */
    1157 GIC     1503365 :     if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
    1158          193566 :         !isLocalBuf)
    1159                 :     {
    1160          193566 :         LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
    1161 ECB             :     }
    1162                 : 
    1163 GIC     1503365 :     if (isLocalBuf)
    1164 ECB             :     {
    1165                 :         /* Only need to adjust flags */
    1166 CBC        3790 :         uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
    1167                 : 
    1168            3790 :         buf_state |= BM_VALID;
    1169            3790 :         pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
    1170                 :     }
    1171                 :     else
    1172                 :     {
    1173                 :         /* Set BM_VALID, terminate IO, and wake up any waiters */
    1174 GIC     1499575 :         TerminateBufferIO(bufHdr, false, BM_VALID);
    1175                 :     }
    1176                 : 
    1177         1503365 :     VacuumPageMiss++;
    1178         1503365 :     if (VacuumCostActive)
    1179             616 :         VacuumCostBalance += VacuumCostPageMiss;
    1180                 : 
    1181                 :     TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
    1182                 :                                       smgr->smgr_rlocator.locator.spcOid,
    1183                 :                                       smgr->smgr_rlocator.locator.dbOid,
    1184                 :                                       smgr->smgr_rlocator.locator.relNumber,
    1185                 :                                       smgr->smgr_rlocator.backend,
    1186 ECB             :                                       found);
    1187                 : 
    1188 GIC     1503365 :     return BufferDescriptorGetBuffer(bufHdr);
    1189                 : }
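
As the backward-compatibility comment in ReadBuffer_common() notes, extending a relation through P_NEW is discouraged in favor of ExtendBufferedRel(). The sketch below contrasts the two call styles from a hypothetical caller's point of view; rel and the chosen flags are illustrative assumptions, not something this report asserts.

    /* Illustrative sketch of a hypothetical caller, not part of bufmgr.c */

    /* legacy style: extension is funneled through ReadBuffer_common() */
    Buffer      newbuf_old = ReadBuffer(rel, P_NEW);

    /*
     * preferred style: the relation extension lock is taken only briefly
     * inside the call (pass EB_LOCK_FIRST to also get the buffer locked)
     */
    Buffer      newbuf_new = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM,
                                               NULL, 0);
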
    1190 ECB             : 
    1191                 : /*
    1192                 :  * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
    1193                 :  *      buffer.  If no buffer exists already, selects a replacement
    1194                 :  *      victim and evicts the old page, but does NOT read in new page.
    1195                 :  *
    1196                 :  * "strategy" can be a buffer replacement strategy object, or NULL for
    1197                 :  * the default strategy.  The selected buffer's usage_count is advanced when
    1198                 :  * using the default strategy, but otherwise possibly not (see PinBuffer).
    1199                 :  *
    1200                 :  * The returned buffer is pinned and is already marked as holding the
    1201                 :  * desired page.  If it already did have the desired page, *foundPtr is
    1202                 :  * set true.  Otherwise, *foundPtr is set false and the buffer is marked
    1203                 :  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
    1204                 :  *
    1205                 :  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
    1206                 :  * we keep it for simplicity in ReadBuffer.
    1207                 :  *
    1208                 :  * io_context is passed as an output parameter to avoid calling
    1209                 :  * IOContextForStrategy() when there is a shared buffers hit and no IO
    1210                 :  * statistics need be captured.
    1211                 :  *
    1212                 :  * No locks are held either at entry or exit.
    1213                 :  */
    1214                 : static BufferDesc *
    1215 GIC    69248171 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    1216 ECB             :             BlockNumber blockNum,
    1217                 :             BufferAccessStrategy strategy,
    1218                 :             bool *foundPtr, IOContext io_context)
    1219                 : {
    1220                 :     BufferTag   newTag;         /* identity of requested block */
    1221                 :     uint32      newHash;        /* hash value for newTag */
    1222                 :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
    1223                 :     int         existing_buf_id;
    1224                 :     Buffer      victim_buffer;
    1225                 :     BufferDesc *victim_buf_hdr;
    1226                 :     uint32      victim_buf_state;
    1227                 : 
    1228 EUB             :     /* create a tag so we can lookup the buffer */
    1229 GNC    69248171 :     InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
    1230                 : 
    1231 EUB             :     /* determine its hash code and partition lock ID */
    1232 GIC    69248171 :     newHash = BufTableHashCode(&newTag);
    1233        69248171 :     newPartitionLock = BufMappingPartitionLock(newHash);
    1234                 : 
    1235                 :     /* see if the block is in the buffer pool already */
    1236        69248171 :     LWLockAcquire(newPartitionLock, LW_SHARED);
    1237 GNC    69248171 :     existing_buf_id = BufTableLookup(&newTag, newHash);
    1238        69248171 :     if (existing_buf_id >= 0)
    1239                 :     {
    1240                 :         BufferDesc *buf;
    1241                 :         bool        valid;
    1242                 : 
    1243                 :         /*
    1244                 :          * Found it.  Now, pin the buffer so no one can steal it from the
    1245                 :          * buffer pool, and check to see if the correct data has been loaded
    1246                 :          * into the buffer.
    1247                 :          */
    1248        67747784 :         buf = GetBufferDescriptor(existing_buf_id);
    1249                 : 
    1250 GIC    67747784 :         valid = PinBuffer(buf, strategy);
    1251                 : 
    1252 ECB             :         /* Can release the mapping lock as soon as we've pinned it */
    1253 CBC    67747784 :         LWLockRelease(newPartitionLock);
    1254                 : 
    1255        67747784 :         *foundPtr = true;
    1256                 : 
    1257 GIC    67747784 :         if (!valid)
    1258 ECB             :         {
    1259                 :             /*
    1260                 :              * We can only get here if (a) someone else is still reading in
    1261                 :              * the page, or (b) a previous read attempt failed.  We have to
    1262                 :              * wait for any active read attempt to finish, and then set up our
    1263                 :              * own read attempt if the page is still not BM_VALID.
    1264                 :              * StartBufferIO does it all.
    1265                 :              */
    1266 GIC        1269 :             if (StartBufferIO(buf, true))
    1267                 :             {
    1268                 :                 /*
    1269 ECB             :                  * If we get here, previous attempts to read the buffer must
    1270                 :                  * have failed ... but we shall bravely try again.
    1271                 :                  */
    1272 CBC          19 :                 *foundPtr = false;
    1273 ECB             :             }
    1274                 :         }
    1275                 : 
    1276 GIC    67747784 :         return buf;
    1277                 :     }
    1278                 : 
    1279                 :     /*
    1280                 :      * Didn't find it in the buffer pool.  We'll have to initialize a new
    1281                 :      * buffer.  Remember to unlock the mapping lock while doing the work.
    1282                 :      */
    1283 CBC     1500387 :     LWLockRelease(newPartitionLock);
    1284                 : 
    1285                 :     /*
     1286                 :      * Acquire a victim buffer. Somebody else might try to do the same, since
     1287                 :      * we don't hold any conflicting locks. If so, we'll have to undo our
     1288                 :      * work later.
    1289                 :      */
    1290 GNC     1500387 :     victim_buffer = GetVictimBuffer(strategy, io_context);
    1291         1500387 :     victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
    1292                 : 
    1293                 :     /*
    1294                 :      * Try to make a hashtable entry for the buffer under its new tag. If
    1295                 :      * somebody else inserted another buffer for the tag, we'll release the
    1296                 :      * victim buffer we acquired and use the already inserted one.
    1297                 :      */
    1298         1500387 :     LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
    1299         1500387 :     existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
    1300         1500387 :     if (existing_buf_id >= 0)
    1301                 :     {
    1302                 :         BufferDesc *existing_buf_hdr;
    1303                 :         bool        valid;
    1304                 : 
    1305                 :         /*
    1306                 :          * Got a collision. Someone has already done what we were about to do.
    1307                 :          * We'll just handle this as if it were found in the buffer pool in
    1308                 :          * the first place.  First, give up the buffer we were planning to
    1309                 :          * use.
    1310                 :          *
    1311                 :          * We could do this after releasing the partition lock, but then we'd
    1312                 :          * have to call ResourceOwnerEnlargeBuffers() &
    1313                 :          * ReservePrivateRefCountEntry() before acquiring the lock, for the
    1314                 :          * rare case of such a collision.
    1315                 :          */
    1316             809 :         UnpinBuffer(victim_buf_hdr);
    1317                 : 
    1318                 :         /*
     1319                 :          * The victim buffer we acquired previously is clean and unused; let
     1320                 :          * it be found again quickly.
    1321                 :          */
    1322             809 :         StrategyFreeBuffer(victim_buf_hdr);
    1323 ECB             : 
    1324                 :         /* remaining code should match code at top of routine */
    1325                 : 
    1326 GNC         809 :         existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
    1327                 : 
    1328             809 :         valid = PinBuffer(existing_buf_hdr, strategy);
    1329                 : 
    1330                 :         /* Can release the mapping lock as soon as we've pinned it */
    1331 GIC         809 :         LWLockRelease(newPartitionLock);
    1332                 : 
    1333 GNC         809 :         *foundPtr = true;
    1334                 : 
    1335             809 :         if (!valid)
    1336                 :         {
    1337                 :             /*
    1338                 :              * We can only get here if (a) someone else is still reading in
    1339                 :              * the page, or (b) a previous read attempt failed.  We have to
    1340                 :              * wait for any active read attempt to finish, and then set up our
    1341                 :              * own read attempt if the page is still not BM_VALID.
    1342                 :              * StartBufferIO does it all.
    1343                 :              */
    1344             509 :             if (StartBufferIO(existing_buf_hdr, true))
    1345                 :             {
    1346                 :                 /*
    1347                 :                  * If we get here, previous attempts to read the buffer must
    1348                 :                  * have failed ... but we shall bravely try again.
    1349                 :                  */
    1350               3 :                 *foundPtr = false;
    1351                 :             }
    1352                 :         }
    1353                 : 
    1354             809 :         return existing_buf_hdr;
    1355                 :     }
    1356                 : 
    1357                 :     /*
    1358                 :      * Need to lock the buffer header too in order to change its tag.
    1359                 :      */
    1360         1499578 :     victim_buf_state = LockBufHdr(victim_buf_hdr);
    1361                 : 
    1362                 :     /* some sanity checks while we hold the buffer header lock */
    1363         1499578 :     Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
    1364         1499578 :     Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
    1365                 : 
    1366         1499578 :     victim_buf_hdr->tag = newTag;
    1367                 : 
    1368                 :     /*
    1369                 :      * Make sure BM_PERMANENT is set for buffers that must be written at every
    1370                 :      * checkpoint.  Unlogged buffers only need to be written at shutdown
    1371                 :      * checkpoints, except for their "init" forks, which need to be treated
    1372                 :      * just like permanent relations.
    1373                 :      */
    1374         1499578 :     victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    1375 CBC     1499578 :     if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
    1376 GNC     1499550 :         victim_buf_state |= BM_PERMANENT;
    1377                 : 
    1378         1499578 :     UnlockBufHdr(victim_buf_hdr, victim_buf_state);
    1379                 : 
    1380 CBC     1499578 :     LWLockRelease(newPartitionLock);
    1381 ECB             : 
    1382                 :     /*
    1383                 :      * Buffer contents are currently invalid.  Try to obtain the right to
    1384                 :      * start I/O.  If StartBufferIO returns false, then someone else managed
    1385                 :      * to read it before we did, so there's nothing left for BufferAlloc() to
    1386                 :      * do.
    1387                 :      */
    1388 GNC     1499578 :     if (StartBufferIO(victim_buf_hdr, true))
    1389 GIC     1499566 :         *foundPtr = false;
    1390                 :     else
    1391              12 :         *foundPtr = true;
    1392                 : 
    1393 GNC     1499578 :     return victim_buf_hdr;
    1394 ECB             : }
    1395                 : 
    1396                 : /*
    1397                 :  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
    1398                 :  * freelist.
    1399                 :  *
    1400 EUB             :  * The buffer header spinlock must be held at entry.  We drop it before
    1401 ECB             :  * returning.  (This is sane because the caller must have locked the
    1402                 :  * buffer in order to be sure it should be dropped.)
    1403                 :  *
    1404                 :  * This is used only in contexts such as dropping a relation.  We assume
    1405                 :  * that no other backend could possibly be interested in using the page,
    1406                 :  * so the only reason the buffer might be pinned is if someone else is
    1407                 :  * trying to write it out.  We have to let them finish before we can
    1408                 :  * reclaim the buffer.
    1409                 :  *
    1410                 :  * The buffer could get reclaimed by someone else while we are waiting
    1411                 :  * to acquire the necessary locks; if so, don't mess it up.
    1412                 :  */
    1413                 : static void
    1414 GIC       83771 : InvalidateBuffer(BufferDesc *buf)
    1415                 : {
    1416                 :     BufferTag   oldTag;
    1417 ECB             :     uint32      oldHash;        /* hash value for oldTag */
    1418                 :     LWLock     *oldPartitionLock;   /* buffer partition lock for it */
    1419                 :     uint32      oldFlags;
    1420                 :     uint32      buf_state;
    1421                 : 
    1422                 :     /* Save the original buffer tag before dropping the spinlock */
    1423 CBC       83771 :     oldTag = buf->tag;
    1424                 : 
    1425 GIC       83771 :     buf_state = pg_atomic_read_u32(&buf->state);
    1426           83771 :     Assert(buf_state & BM_LOCKED);
    1427           83771 :     UnlockBufHdr(buf, buf_state);
    1428 ECB             : 
    1429                 :     /*
    1430                 :      * Need to compute the old tag's hashcode and partition lock ID. XXX is it
    1431                 :      * worth storing the hashcode in BufferDesc so we need not recompute it
    1432                 :      * here?  Probably not.
    1433                 :      */
    1434 GIC       83771 :     oldHash = BufTableHashCode(&oldTag);
    1435           83771 :     oldPartitionLock = BufMappingPartitionLock(oldHash);
    1436                 : 
    1437           83773 : retry:
    1438                 : 
    1439                 :     /*
    1440                 :      * Acquire exclusive mapping lock in preparation for changing the buffer's
    1441 ECB             :      * association.
    1442                 :      */
    1443 GIC       83773 :     LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
    1444                 : 
    1445                 :     /* Re-lock the buffer header */
    1446           83773 :     buf_state = LockBufHdr(buf);
    1447                 : 
    1448 ECB             :     /* If it's changed while we were waiting for lock, do nothing */
    1449 GNC       83773 :     if (!BufferTagsEqual(&buf->tag, &oldTag))
    1450                 :     {
    1451 CBC           2 :         UnlockBufHdr(buf, buf_state);
    1452 GIC           2 :         LWLockRelease(oldPartitionLock);
    1453 CBC           2 :         return;
    1454 ECB             :     }
    1455                 : 
    1456                 :     /*
    1457                 :      * We assume the only reason for it to be pinned is that someone else is
    1458                 :      * flushing the page out.  Wait for them to finish.  (This could be an
    1459                 :      * infinite loop if the refcount is messed up... it would be nice to time
    1460                 :      * out after awhile, but there seems no way to be sure how many loops may
    1461                 :      * be needed.  Note that if the other guy has pinned the buffer but not
    1462                 :      * yet done StartBufferIO, WaitIO will fall through and we'll effectively
    1463                 :      * be busy-looping here.)
    1464                 :      */
    1465 CBC       83771 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
    1466 ECB             :     {
    1467 CBC           2 :         UnlockBufHdr(buf, buf_state);
    1468 GIC           2 :         LWLockRelease(oldPartitionLock);
    1469                 :         /* safety check: should definitely not be our *own* pin */
    1470               2 :         if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
    1471 UIC           0 :             elog(ERROR, "buffer is pinned in InvalidateBuffer");
    1472 GIC           2 :         WaitIO(buf);
    1473 CBC           2 :         goto retry;
    1474                 :     }
    1475 ECB             : 
    1476                 :     /*
    1477                 :      * Clear out the buffer's tag and flags.  We must do this to ensure that
    1478                 :      * linear scans of the buffer array don't think the buffer is valid.
    1479                 :      */
    1480 CBC       83769 :     oldFlags = buf_state & BUF_FLAG_MASK;
    1481 GNC       83769 :     ClearBufferTag(&buf->tag);
    1482 GIC       83769 :     buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
    1483           83769 :     UnlockBufHdr(buf, buf_state);
    1484                 : 
    1485                 :     /*
    1486                 :      * Remove the buffer from the lookup hashtable, if it was in there.
    1487                 :      */
    1488           83769 :     if (oldFlags & BM_TAG_VALID)
    1489           83769 :         BufTableDelete(&oldTag, oldHash);
    1490 ECB             : 
    1491                 :     /*
    1492                 :      * Done with mapping lock.
    1493                 :      */
    1494 CBC       83769 :     LWLockRelease(oldPartitionLock);
    1495                 : 
    1496                 :     /*
    1497 ECB             :      * Insert the buffer at the head of the list of free buffers.
    1498                 :      */
    1499 CBC       83769 :     StrategyFreeBuffer(buf);
    1500                 : }
    1501 ECB             : 
    1502                 : /*
    1503                 :  * Helper routine for GetVictimBuffer()
    1504                 :  *
    1505                 :  * Needs to be called on a buffer with a valid tag, pinned, but without the
    1506                 :  * buffer header spinlock held.
    1507                 :  *
    1508                 :  * Returns true if the buffer can be reused, in which case the buffer is only
    1509                 :  * pinned by this backend and marked as invalid, false otherwise.
    1510                 :  */
    1511                 : static bool
    1512 GNC     1052237 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
    1513                 : {
    1514                 :     uint32      buf_state;
    1515                 :     uint32      hash;
    1516                 :     LWLock     *partition_lock;
    1517                 :     BufferTag   tag;
    1518                 : 
    1519         1052237 :     Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
    1520                 : 
    1521                 :     /* have buffer pinned, so it's safe to read tag without lock */
    1522         1052237 :     tag = buf_hdr->tag;
    1523                 : 
    1524         1052237 :     hash = BufTableHashCode(&tag);
    1525         1052237 :     partition_lock = BufMappingPartitionLock(hash);
    1526                 : 
    1527         1052237 :     LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    1528                 : 
    1529                 :     /* lock the buffer header */
    1530         1052237 :     buf_state = LockBufHdr(buf_hdr);
    1531                 : 
    1532                 :     /*
     1533                 :      * We have the buffer pinned, so nobody else should have been able to
     1534                 :      * unset this concurrently.
    1535                 :      */
    1536         1052237 :     Assert(buf_state & BM_TAG_VALID);
    1537         1052237 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    1538         1052237 :     Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
    1539                 : 
    1540                 :     /*
    1541                 :      * If somebody else pinned the buffer since, or even worse, dirtied it,
    1542                 :      * give up on this buffer: It's clearly in use.
    1543                 :      */
    1544         1052237 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
    1545                 :     {
    1546             308 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    1547                 : 
    1548             308 :         UnlockBufHdr(buf_hdr, buf_state);
    1549             308 :         LWLockRelease(partition_lock);
    1550                 : 
    1551             308 :         return false;
    1552                 :     }
    1553                 : 
    1554                 :     /*
    1555                 :      * Clear out the buffer's tag and flags and usagecount.  This is not
    1556                 :      * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
     1557                 :      * doing anything with the buffer. But currently it's beneficial, as the
     1558                 :      * cheaper pre-check used by several linear scans of shared buffers relies
     1559                 :      * on the tag (see e.g. FlushDatabaseBuffers()).
    1560                 :      */
    1561         1051929 :     ClearBufferTag(&buf_hdr->tag);
    1562         1051929 :     buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
    1563         1051929 :     UnlockBufHdr(buf_hdr, buf_state);
    1564                 : 
    1565         1051929 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    1566                 : 
    1567                 :     /* finally delete buffer from the buffer mapping table */
    1568         1051929 :     BufTableDelete(&tag, hash);
    1569                 : 
    1570         1051929 :     LWLockRelease(partition_lock);
    1571                 : 
    1572         1051929 :     Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
    1573         1051929 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    1574         1051929 :     Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
    1575                 : 
    1576         1051929 :     return true;
    1577                 : }
    1578                 : 
    1579                 : static Buffer
    1580         1870075 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
    1581                 : {
    1582                 :     BufferDesc *buf_hdr;
    1583                 :     Buffer      buf;
    1584                 :     uint32      buf_state;
    1585                 :     bool        from_ring;
    1586                 : 
    1587                 :     /*
    1588                 :      * Ensure, while the spinlock's not yet held, that there's a free refcount
    1589                 :      * entry.
    1590                 :      */
    1591         1870075 :     ReservePrivateRefCountEntry();
    1592         1870075 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    1593                 : 
    1594                 :     /* we return here if a prospective victim buffer gets used concurrently */
    1595            8428 : again:
    1596                 : 
    1597                 :     /*
    1598                 :      * Select a victim buffer.  The buffer is returned with its header
    1599                 :      * spinlock still held!
    1600                 :      */
    1601         1878503 :     buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
    1602         1878503 :     buf = BufferDescriptorGetBuffer(buf_hdr);
    1603                 : 
    1604         1878503 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
    1605                 : 
    1606                 :     /* Pin the buffer and then release the buffer spinlock */
    1607         1878503 :     PinBuffer_Locked(buf_hdr);
    1608                 : 
    1609                 :     /*
    1610                 :      * We shouldn't have any other pins for this buffer.
    1611                 :      */
    1612         1878503 :     CheckBufferIsPinnedOnce(buf);
    1613                 : 
    1614                 :     /*
    1615                 :      * If the buffer was dirty, try to write it out.  There is a race
    1616                 :      * condition here, in that someone might dirty it after we released the
    1617                 :      * buffer header lock above, or even while we are writing it out (since
    1618                 :      * our share-lock won't prevent hint-bit updates).  We will recheck the
    1619                 :      * dirty bit after re-locking the buffer header.
    1620                 :      */
    1621         1878503 :     if (buf_state & BM_DIRTY)
    1622                 :     {
    1623                 :         LWLock     *content_lock;
    1624                 : 
    1625          242096 :         Assert(buf_state & BM_TAG_VALID);
    1626          242096 :         Assert(buf_state & BM_VALID);
    1627                 : 
    1628                 :         /*
    1629                 :          * We need a share-lock on the buffer contents to write it out (else
    1630                 :          * we might write invalid data, eg because someone else is compacting
    1631                 :          * the page contents while we write).  We must use a conditional lock
    1632                 :          * acquisition here to avoid deadlock.  Even though the buffer was not
    1633                 :          * pinned (and therefore surely not locked) when StrategyGetBuffer
    1634                 :          * returned it, someone else could have pinned and exclusive-locked it
    1635                 :          * by the time we get here. If we try to get the lock unconditionally,
    1636                 :          * we'd block waiting for them; if they later block waiting for us,
    1637                 :          * deadlock ensues. (This has been observed to happen when two
    1638                 :          * backends are both trying to split btree index pages, and the second
    1639                 :          * one just happens to be trying to split the page the first one got
    1640                 :          * from StrategyGetBuffer.)
    1641                 :          */
    1642          242096 :         content_lock = BufferDescriptorGetContentLock(buf_hdr);
    1643          242096 :         if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
    1644                 :         {
    1645                 :             /*
    1646                 :              * Someone else has locked the buffer, so give it up and loop back
    1647                 :              * to get another one.
    1648                 :              */
    1649 UNC           0 :             UnpinBuffer(buf_hdr);
    1650               0 :             goto again;
    1651                 :         }
    1652                 : 
    1653                 :         /*
    1654                 :          * If using a nondefault strategy, and writing the buffer would
    1655                 :          * require a WAL flush, let the strategy decide whether to go ahead
    1656                 :          * and write/reuse the buffer or to choose another victim.  We need a
    1657                 :          * lock to inspect the page LSN, so this can't be done inside
    1658                 :          * StrategyGetBuffer.
    1659                 :          */
    1660 GNC      242096 :         if (strategy != NULL)
    1661                 :         {
    1662                 :             XLogRecPtr  lsn;
    1663                 : 
    1664                 :             /* Read the LSN while holding buffer header lock */
    1665           65674 :             buf_state = LockBufHdr(buf_hdr);
    1666           65674 :             lsn = BufferGetLSN(buf_hdr);
    1667           65674 :             UnlockBufHdr(buf_hdr, buf_state);
    1668                 : 
    1669           65674 :             if (XLogNeedsFlush(lsn)
    1670           10275 :                 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
    1671                 :             {
    1672            8120 :                 LWLockRelease(content_lock);
    1673            8120 :                 UnpinBuffer(buf_hdr);
    1674            8120 :                 goto again;
    1675                 :             }
    1676                 :         }
    1677                 : 
    1678                 :         /* OK, do the I/O */
    1679          233976 :         FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
    1680          233976 :         LWLockRelease(content_lock);
    1681                 : 
    1682          233976 :         ScheduleBufferTagForWriteback(&BackendWritebackContext,
    1683                 :                                       &buf_hdr->tag);
    1684                 :     }
    1685                 : 
    1686                 : 
    1687         1870383 :     if (buf_state & BM_VALID)
    1688                 :     {
    1689                 :         /*
    1690                 :          * When a BufferAccessStrategy is in use, blocks evicted from shared
    1691                 :          * buffers are counted as IOOP_EVICT in the corresponding context
    1692                 :          * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
    1693                 :          * strategy in two cases: 1) while initially claiming buffers for the
     1694                 :          * strategy ring, and 2) to replace an existing strategy ring buffer
    1695                 :          * because it is pinned or in use and cannot be reused.
    1696                 :          *
    1697                 :          * Blocks evicted from buffers already in the strategy ring are
    1698                 :          * counted as IOOP_REUSE in the corresponding strategy context.
    1699                 :          *
    1700                 :          * At this point, we can accurately count evictions and reuses,
    1701                 :          * because we have successfully claimed the valid buffer. Previously,
    1702                 :          * we may have been forced to release the buffer due to concurrent
    1703                 :          * pinners or erroring out.
    1704                 :          */
    1705         1052237 :         pgstat_count_io_op(IOOBJECT_RELATION, io_context,
    1706         1052237 :                            from_ring ? IOOP_REUSE : IOOP_EVICT);
    1707                 :     }
    1708                 : 
    1709                 :     /*
    1710                 :      * If the buffer has an entry in the buffer mapping table, delete it. This
    1711                 :      * can fail because another backend could have pinned or dirtied the
    1712                 :      * buffer.
    1713                 :      */
    1714         1870383 :     if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
    1715                 :     {
    1716             308 :         UnpinBuffer(buf_hdr);
    1717             308 :         goto again;
    1718                 :     }
    1719                 : 
    1720                 :     /* a final set of sanity checks */
    1721                 : #ifdef USE_ASSERT_CHECKING
    1722         1870075 :     buf_state = pg_atomic_read_u32(&buf_hdr->state);
    1723                 : 
    1724         1870075 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    1725         1870075 :     Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
    1726                 : 
    1727         1870075 :     CheckBufferIsPinnedOnce(buf);
    1728                 : #endif
    1729                 : 
    1730         1870075 :     return buf;
    1731                 : }
    1732                 : 
    1733                 : /*
    1734                 :  * Limit the number of pins a batch operation may additionally acquire, to
    1735                 :  * avoid running out of pinnable buffers.
    1736                 :  *
    1737                 :  * One additional pin is always allowed, as otherwise the operation likely
    1738                 :  * cannot be performed at all.
    1739                 :  *
    1740                 :  * The number of allowed pins for a backend is computed based on
    1741                 :  * shared_buffers and the maximum number of connections possible. That's very
    1742                 :  * pessimistic, but outside of toy-sized shared_buffers it should allow
    1743                 :  * sufficient pins.
    1744                 :  */
    1745                 : static void
    1746          332890 : LimitAdditionalPins(uint32 *additional_pins)
    1747                 : {
    1748                 :     uint32      max_backends;
    1749                 :     int         max_proportional_pins;
    1750                 : 
    1751          332890 :     if (*additional_pins <= 1)
    1752          314544 :         return;
    1753                 : 
    1754           18346 :     max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
    1755           18346 :     max_proportional_pins = NBuffers / max_backends;
    1756                 : 
    1757                 :     /*
    1758                 :      * Subtract the approximate number of buffers already pinned by this
    1759                 :      * backend. We get the number of "overflowed" pins for free, but don't
    1760                 :      * know the number of pins in PrivateRefCountArray. The cost of
    1761                 :      * calculating that exactly doesn't seem worth it, so just assume the max.
    1762                 :      */
    1763           18346 :     max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
    1764                 : 
    1765           18346 :     if (max_proportional_pins < 0)
    1766            3236 :         max_proportional_pins = 1;
    1767                 : 
    1768           18346 :     if (*additional_pins > max_proportional_pins)
    1769            3236 :         *additional_pins = max_proportional_pins;
    1770                 : }
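
To make the arithmetic above concrete (a hedged, illustrative example; the exact auxiliary-process count and a REFCOUNT_ARRAY_ENTRIES value of 8 are assumptions, not taken from this report): with NBuffers = 16384 (128 MB of shared buffers at 8 kB pages) and roughly 105 backend plus auxiliary slots, max_proportional_pins starts near 16384 / 105 = 156; subtracting REFCOUNT_ARRAY_ENTRIES and any overflowed pins leaves roughly 148, so a batch asking for 64 additional pins is left untouched. With a toy setting such as NBuffers = 128, the quotient is 1, the subtraction drives it negative, and the final clamp limits the batch to a single additional pin.
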
    1771                 : 
    1772                 : /*
    1773                 :  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
    1774                 :  * avoid duplicating the tracing and relpersistence related logic.
    1775                 :  */
    1776                 : static BlockNumber
    1777          343785 : ExtendBufferedRelCommon(ExtendBufferedWhat eb,
    1778                 :                         ForkNumber fork,
    1779                 :                         BufferAccessStrategy strategy,
    1780                 :                         uint32 flags,
    1781                 :                         uint32 extend_by,
    1782                 :                         BlockNumber extend_upto,
    1783                 :                         Buffer *buffers,
    1784                 :                         uint32 *extended_by)
    1785                 : {
    1786                 :     BlockNumber first_block;
    1787                 : 
    1788                 :     TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
    1789                 :                                          eb.smgr->smgr_rlocator.locator.spcOid,
    1790                 :                                          eb.smgr->smgr_rlocator.locator.dbOid,
    1791                 :                                          eb.smgr->smgr_rlocator.locator.relNumber,
    1792                 :                                          eb.smgr->smgr_rlocator.backend,
    1793                 :                                          extend_by);
    1794                 : 
    1795          343785 :     if (eb.relpersistence == RELPERSISTENCE_TEMP)
    1796           10895 :         first_block = ExtendBufferedRelLocal(eb, fork, flags,
    1797                 :                                              extend_by, extend_upto,
    1798                 :                                              buffers, &extend_by);
    1799                 :     else
    1800          332890 :         first_block = ExtendBufferedRelShared(eb, fork, strategy, flags,
    1801                 :                                               extend_by, extend_upto,
    1802                 :                                               buffers, &extend_by);
    1803          343785 :     *extended_by = extend_by;
    1804                 : 
    1805                 :     TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
    1806                 :                                         eb.smgr->smgr_rlocator.locator.spcOid,
    1807                 :                                         eb.smgr->smgr_rlocator.locator.dbOid,
    1808                 :                                         eb.smgr->smgr_rlocator.locator.relNumber,
    1809                 :                                         eb.smgr->smgr_rlocator.backend,
    1810                 :                                         *extended_by,
    1811                 :                                         first_block);
    1812                 : 
    1813          343785 :     return first_block;
    1814                 : }
    1815                 : 
    1816                 : /*
    1817                 :  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
    1818                 :  * shared buffers.
    1819                 :  */
    1820                 : static BlockNumber
    1821          332890 : ExtendBufferedRelShared(ExtendBufferedWhat eb,
    1822                 :                         ForkNumber fork,
    1823                 :                         BufferAccessStrategy strategy,
    1824                 :                         uint32 flags,
    1825                 :                         uint32 extend_by,
    1826                 :                         BlockNumber extend_upto,
    1827                 :                         Buffer *buffers,
    1828                 :                         uint32 *extended_by)
    1829                 : {
    1830                 :     BlockNumber first_block;
    1831          332890 :     IOContext   io_context = IOContextForStrategy(strategy);
    1832                 :     instr_time  io_start;
    1833                 : 
    1834          332890 :     LimitAdditionalPins(&extend_by);
    1835                 : 
    1836                 :     /*
    1837                 :      * Acquire victim buffers for extension without holding extension lock.
    1838                 :      * Writing out victim buffers is the most expensive part of extending the
    1839                 :      * relation, particularly when doing so requires WAL flushes. Zeroing out
    1840                 :      * the buffers is also quite expensive, so do that before holding the
    1841                 :      * extension lock as well.
    1842                 :      *
    1843                 :      * These pages are pinned by us and not valid. While we hold the pin they
    1844                 :      * can't be acquired as victim buffers by another backend.
    1845                 :      */
    1846          702578 :     for (uint32 i = 0; i < extend_by; i++)
    1847                 :     {
    1848                 :         Block       buf_block;
    1849                 : 
    1850          369688 :         buffers[i] = GetVictimBuffer(strategy, io_context);
    1851          369688 :         buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
    1852                 : 
    1853                 :         /* new buffers are zero-filled */
    1854          369688 :         MemSet((char *) buf_block, 0, BLCKSZ);
    1855                 :     }
    1856                 : 
    1857                 :     /* in case we need to pin an existing buffer below */
    1858          332890 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    1859                 : 
    1860                 :     /*
    1861                 :      * Lock relation against concurrent extensions, unless requested not to.
    1862                 :      *
    1863                 :      * We use the same extension lock for all forks. That's unnecessarily
    1864                 :      * restrictive, but currently extensions for forks don't happen often
    1865                 :      * enough to make it worth locking more granularly.
    1866                 :      *
    1867                 :      * Note that another backend might have extended the relation by the time
    1868                 :      * we get the lock.
    1869                 :      */
    1870          332890 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
    1871                 :     {
    1872          289641 :         LockRelationForExtension(eb.rel, ExclusiveLock);
    1873          289641 :         if (eb.rel)
    1874          289641 :             eb.smgr = RelationGetSmgr(eb.rel);
    1875                 :     }
    1876                 : 
    1877                 :     /*
    1878                 :      * If requested, invalidate size cache, so that smgrnblocks asks the
    1879                 :      * kernel.
    1880                 :      */
    1881          332890 :     if (flags & EB_CLEAR_SIZE_CACHE)
    1882           28427 :         eb.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    1883                 : 
    1884          332890 :     first_block = smgrnblocks(eb.smgr, fork);
    1885                 : 
    1886                 :     /*
    1887                 :      * Now that we have the accurate relation size, check if the caller wants
    1888                 :      * us to extend only up to a specific size. If there were concurrent
    1889                 :      * extensions, we might have acquired too many buffers and need to release
    1890                 :      * them.
    1891                 :      */
    1892          332890 :     if (extend_upto != InvalidBlockNumber)
    1893                 :     {
    1894           66293 :         uint32      orig_extend_by = extend_by;
    1895                 : 
    1896           66293 :         if (first_block > extend_upto)
    1897 UNC           0 :             extend_by = 0;
    1898 GNC       66293 :         else if ((uint64) first_block + extend_by > extend_upto)
    1899 UNC           0 :             extend_by = extend_upto - first_block;
    1900                 : 
    1901 GNC       66293 :         for (uint32 i = extend_by; i < orig_extend_by; i++)
    1902                 :         {
    1903 UNC           0 :             BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
    1904                 : 
    1905                 :             /*
    1906                 :              * The victim buffer we acquired previously is clean and unused;
    1907                 :              * let it be found again quickly.
    1908                 :              */
    1909               0 :             StrategyFreeBuffer(buf_hdr);
    1910               0 :             UnpinBuffer(buf_hdr);
    1911                 :         }
    1912                 : 
    1913 GNC       66293 :         if (extend_by == 0)
    1914                 :         {
    1915 UNC           0 :             if (!(flags & EB_SKIP_EXTENSION_LOCK))
    1916               0 :                 UnlockRelationForExtension(eb.rel, ExclusiveLock);
    1917               0 :             *extended_by = extend_by;
    1918               0 :             return first_block;
    1919                 :         }
    1920                 :     }
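To make the clamping above concrete with made-up numbers: suppose we planned extend_by = 8 toward extend_upto = 100, but a concurrent extension means smgrnblocks() now reports first_block = 97; extend_by is then reduced to 100 - 97 = 3 and the loop releases the 5 surplus victim buffers. Had first_block already reached 100, extend_by would become 0 and the function would unlock and return early, as in the block above.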
    1921                 : 
    1922                 :     /* Fail if relation is already at maximum possible length */
    1923 GNC      332890 :     if ((uint64) first_block + extend_by >= MaxBlockNumber)
    1924 UNC           0 :         ereport(ERROR,
    1925                 :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    1926                 :                  errmsg("cannot extend relation %s beyond %u blocks",
    1927                 :                         relpath(eb.smgr->smgr_rlocator, fork),
    1928                 :                         MaxBlockNumber)));
    1929                 : 
    1930                 :     /*
    1931                 :      * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
    1932                 :      *
    1933                 :      * This needs to happen before we extend the relation, because as soon as
    1934                 :      * we do, other backends can start to read in those pages.
    1935                 :      */
    1936 GNC      702578 :     for (int i = 0; i < extend_by; i++)
    1937                 :     {
    1938          369688 :         Buffer      victim_buf = buffers[i];
    1939          369688 :         BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
    1940                 :         BufferTag   tag;
    1941                 :         uint32      hash;
    1942                 :         LWLock     *partition_lock;
    1943                 :         int         existing_id;
    1944                 : 
    1945          369688 :         InitBufferTag(&tag, &eb.smgr->smgr_rlocator.locator, fork, first_block + i);
    1946          369688 :         hash = BufTableHashCode(&tag);
    1947          369688 :         partition_lock = BufMappingPartitionLock(hash);
    1948                 : 
    1949          369688 :         LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    1950                 : 
    1951          369688 :         existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
    1952                 : 
    1953                 :         /*
    1954                 :          * We get here only in the corner case where we are trying to extend
    1955                 :          * the relation but we found a pre-existing buffer. This can happen
    1956                 :          * because a prior attempt at extending the relation failed, and
    1957                 :          * because mdread doesn't complain about reads beyond EOF (when
    1958                 :          * zero_damaged_pages is ON) and so a previous attempt to read a block
    1959                 :          * beyond EOF could have left a "valid" zero-filled buffer.
    1960                 :          * Unfortunately, we have also seen this case occurring because of
    1961                 :          * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
    1962                 :          * that doesn't account for a recent write. In that situation, the
    1963                 :          * pre-existing buffer would contain valid data that we don't want to
    1964                 :          * overwrite.  Since the legitimate cases should always have left a
    1965                 :          * zero-filled buffer, complain if not PageIsNew.
    1966                 :          */
    1967          369688 :         if (existing_id >= 0)
    1968                 :         {
    1969 UNC           0 :             BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
    1970                 :             Block       buf_block;
    1971                 :             bool        valid;
    1972                 : 
    1973                 :             /*
    1974                 :              * Pin the existing buffer before releasing the partition lock,
    1975                 :              * preventing it from being evicted.
    1976                 :              */
    1977               0 :             valid = PinBuffer(existing_hdr, strategy);
    1978                 : 
    1979               0 :             LWLockRelease(partition_lock);
    1980                 : 
    1981                 :             /*
    1982                 :              * The victim buffer we acquired previously is clean and unused;
    1983                 :              * let it be found again quickly.
    1984                 :              */
    1985               0 :             StrategyFreeBuffer(victim_buf_hdr);
    1986               0 :             UnpinBuffer(victim_buf_hdr);
    1987                 : 
    1988               0 :             buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
    1989               0 :             buf_block = BufHdrGetBlock(existing_hdr);
    1990                 : 
    1991               0 :             if (valid && !PageIsNew((Page) buf_block))
    1992               0 :                 ereport(ERROR,
    1993                 :                         (errmsg("unexpected data beyond EOF in block %u of relation %s",
    1994                 :                                 existing_hdr->tag.blockNum, relpath(eb.smgr->smgr_rlocator, fork)),
    1995                 :                          errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
    1996                 : 
    1997                 :             /*
    1998                 :              * We *must* do smgr[zero]extend before succeeding, else the page
    1999                 :              * will not be reserved by the kernel, and the next P_NEW call
    2000                 :              * will decide to return the same page.  Clear the BM_VALID bit,
    2001                 :              * do StartBufferIO() and proceed.
    2002                 :              *
    2003                 :              * Loop to handle the very small possibility that someone re-sets
    2004                 :              * BM_VALID between our clearing it and StartBufferIO inspecting
    2005                 :              * it.
    2006                 :              */
    2007                 :             do
    2008                 :             {
    2009               0 :                 uint32      buf_state = LockBufHdr(existing_hdr);
    2010                 : 
    2011               0 :                 buf_state &= ~BM_VALID;
    2012               0 :                 UnlockBufHdr(existing_hdr, buf_state);
    2013               0 :             } while (!StartBufferIO(existing_hdr, true));
    2014                 :         }
    2015                 :         else
    2016                 :         {
    2017                 :             uint32      buf_state;
    2018                 : 
    2019 GNC      369688 :             buf_state = LockBufHdr(victim_buf_hdr);
    2020                 : 
    2021                 :             /* some sanity checks while we hold the buffer header lock */
    2022          369688 :             Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
    2023          369688 :             Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2024                 : 
    2025          369688 :             victim_buf_hdr->tag = tag;
    2026                 : 
    2027          369688 :             buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2028          369688 :             if (eb.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
    2029          365635 :                 buf_state |= BM_PERMANENT;
    2030                 : 
    2031          369688 :             UnlockBufHdr(victim_buf_hdr, buf_state);
    2032                 : 
    2033          369688 :             LWLockRelease(partition_lock);
    2034                 : 
    2035                 :             /* XXX: could combine the locked operations in StartBufferIO() with the above */
    2036          369688 :             StartBufferIO(victim_buf_hdr, true);
    2037                 :         }
    2038                 :     }
    2039                 : 
    2040          332890 :     io_start = pgstat_prepare_io_time();
    2041                 : 
    2042                 :     /*
    2043                 :      * Note: if smgrzeroextend fails, we will end up with buffers that are
    2044                 :      * allocated but not marked BM_VALID.  The next relation extension will
    2045                 :      * still select the same block number (because the relation didn't get any
    2046                 :      * longer on disk) and so future attempts to extend the relation will find
    2047                 :      * the same buffers (if they have not been recycled) but come right back
    2048                 :      * here to try smgrzeroextend again.
    2049                 :      *
    2050                 :      * We don't need to set checksum for all-zero pages.
    2051                 :      */
    2052          332890 :     smgrzeroextend(eb.smgr, fork, first_block, extend_by, false);
    2053                 : 
    2054                 :     /*
    2055                 :      * Release the file-extension lock; it's now OK for someone else to extend
    2056                 :      * the relation some more.
    2057                 :      *
    2058                 :      * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
    2059                 :      * take noticeable time.
    2060                 :      */
    2061          332890 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2062          289641 :         UnlockRelationForExtension(eb.rel, ExclusiveLock);
    2063                 : 
    2064          332890 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
    2065                 :                             io_start, extend_by);
    2066                 : 
    2067                 :     /* Set BM_VALID, terminate IO, and wake up any waiters */
    2068          702578 :     for (int i = 0; i < extend_by; i++)
    2069                 :     {
    2070          369688 :         Buffer      buf = buffers[i];
    2071          369688 :         BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
    2072          369688 :         bool        lock = false;
    2073                 : 
    2074          369688 :         if (flags & EB_LOCK_FIRST && i == 0)
    2075          266410 :             lock = true;
    2076          103278 :         else if (flags & EB_LOCK_TARGET)
    2077                 :         {
    2078           36935 :             Assert(extend_upto != InvalidBlockNumber);
    2079           36935 :             if (first_block + i + 1 == extend_upto)
    2080           36056 :                 lock = true;
    2081                 :         }
    2082                 : 
    2083          369688 :         if (lock)
    2084          302466 :             LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
    2085                 : 
    2086          369688 :         TerminateBufferIO(buf_hdr, false, BM_VALID);
    2087                 :     }
    2088                 : 
    2089          332890 :     pgBufferUsage.shared_blks_written += extend_by;
    2090                 : 
    2091          332890 :     *extended_by = extend_by;
    2092                 : 
    2093          332890 :     return first_block;
    2094                 : }
    2095                 : 
    2096 ECB             : /*
    2097                 :  * MarkBufferDirty
    2098                 :  *
    2099                 :  *      Marks buffer contents as dirty (actual write happens later).
    2100                 :  *
    2101                 :  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
    2102                 :  * exclusive lock, then somebody could be in process of writing the buffer,
    2103                 :  * leading to risk of bad data written to disk.)
    2104                 :  */
    2105                 : void
    2106 GIC    29096899 : MarkBufferDirty(Buffer buffer)
    2107                 : {
    2108                 :     BufferDesc *bufHdr;
    2109                 :     uint32      buf_state;
    2110                 :     uint32      old_buf_state;
    2111                 : 
    2112        29096899 :     if (!BufferIsValid(buffer))
    2113 UIC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    2114 ECB             : 
    2115 CBC    29096899 :     if (BufferIsLocal(buffer))
    2116                 :     {
    2117 GIC     1138590 :         MarkLocalBufferDirty(buffer);
    2118 CBC     1138590 :         return;
    2119                 :     }
    2120                 : 
    2121 GIC    27958309 :     bufHdr = GetBufferDescriptor(buffer - 1);
    2122                 : 
    2123        27958309 :     Assert(BufferIsPinned(buffer));
    2124 CBC    27958309 :     Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
    2125 ECB             :                                 LW_EXCLUSIVE));
    2126                 : 
    2127 CBC    27958309 :     old_buf_state = pg_atomic_read_u32(&bufHdr->state);
    2128                 :     for (;;)
    2129                 :     {
    2130        27958389 :         if (old_buf_state & BM_LOCKED)
    2131 GIC          65 :             old_buf_state = WaitBufHdrUnlocked(bufHdr);
    2132                 : 
    2133        27958389 :         buf_state = old_buf_state;
    2134                 : 
    2135 CBC    27958389 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2136 GIC    27958389 :         buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
    2137                 : 
    2138        27958389 :         if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
    2139                 :                                            buf_state))
    2140        27958309 :             break;
    2141                 :     }
    2142                 : 
    2143                 :     /*
    2144 ECB             :      * If the buffer was not dirty already, do vacuum accounting.
    2145                 :      */
    2146 GIC    27958309 :     if (!(old_buf_state & BM_DIRTY))
    2147                 :     {
    2148 CBC      743896 :         VacuumPageDirty++;
    2149          743896 :         pgBufferUsage.shared_blks_dirtied++;
    2150 GIC      743896 :         if (VacuumCostActive)
    2151            1244 :             VacuumCostBalance += VacuumCostPageDirty;
    2152                 :     }
    2153                 : }
    2154                 : 
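The comment below describes the locking contract for MarkBufferDirty(); the following is a minimal sketch of the conventional calling pattern ("rel" and "blkno" are hypothetical, and the WAL-logging step is elided):

    Buffer      buf = ReadBuffer(rel, blkno);       /* pin the target page */

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);         /* exclusive lock before modifying */
    /* ... modify the page contents via BufferGetPage(buf) ... */
    MarkBufferDirty(buf);                           /* defer the actual disk write */
    /* ... WAL-logged code paths would XLogInsert() the change here ... */
    UnlockReleaseBuffer(buf);                       /* drop content lock and pin */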
    2155                 : /*
    2156                 :  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
    2157                 :  *
    2158                 :  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
    2159                 :  * compared to calling the two routines separately.  Now it's mainly just
    2160                 :  * a convenience function.  However, if the passed buffer is valid and
    2161                 :  * already contains the desired block, we just return it as-is; and that
    2162                 :  * does save considerable work compared to a full release and reacquire.
    2163                 :  *
    2164                 :  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
    2165 ECB             :  * buffer actually needs to be released.  This case is the same as ReadBuffer,
    2166                 :  * but can save some tests in the caller.
    2167                 :  */
    2168                 : Buffer
    2169 GIC    33565066 : ReleaseAndReadBuffer(Buffer buffer,
    2170                 :                      Relation relation,
    2171                 :                      BlockNumber blockNum)
    2172 EUB             : {
    2173 GBC    33565066 :     ForkNumber  forkNum = MAIN_FORKNUM;
    2174                 :     BufferDesc *bufHdr;
    2175                 : 
    2176 GIC    33565066 :     if (BufferIsValid(buffer))
    2177                 :     {
    2178        22743215 :         Assert(BufferIsPinned(buffer));
    2179        22743215 :         if (BufferIsLocal(buffer))
    2180                 :         {
    2181          105134 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    2182          107078 :             if (bufHdr->tag.blockNum == blockNum &&
    2183 GNC        3888 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    2184            1944 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    2185 GIC        1944 :                 return buffer;
    2186 GNC      103190 :             UnpinLocalBuffer(buffer);
    2187 ECB             :         }
    2188                 :         else
    2189                 :         {
    2190 GIC    22638081 :             bufHdr = GetBufferDescriptor(buffer - 1);
    2191 ECB             :             /* we have pin, so it's ok to examine tag without spinlock */
    2192 CBC    30582933 :             if (bufHdr->tag.blockNum == blockNum &&
    2193 GNC    15889704 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    2194         7944852 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    2195 CBC     7944852 :                 return buffer;
    2196 GNC    14693229 :             UnpinBuffer(bufHdr);
    2197                 :         }
    2198                 :     }
    2199                 : 
    2200 GIC    25618270 :     return ReadBuffer(relation, blockNum);
    2201 ECB             : }
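A sketch of the intended use, assuming a hypothetical chain walk over "rel" in which each visited page yields the next block number in "blkno":

    Buffer      buf = InvalidBuffer;

    while (BlockNumberIsValid(blkno))
    {
        /* keeps the pin if blkno is the block already held, else swaps buffers */
        buf = ReleaseAndReadBuffer(buf, rel, blkno);
        /* ... examine the page, set blkno to the next block or InvalidBlockNumber ... */
    }
    if (BufferIsValid(buf))
        ReleaseBuffer(buf);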
    2202                 : 
    2203                 : /*
    2204                 :  * PinBuffer -- make buffer unavailable for replacement.
    2205                 :  *
    2206                 :  * For the default access strategy, the buffer's usage_count is incremented
    2207                 :  * when we first pin it; for other strategies we just make sure the usage_count
    2208                 :  * isn't zero.  (The idea of the latter is that we don't want synchronized
    2209                 :  * heap scans to inflate the count, but we need it to not be zero to discourage
    2210                 :  * other backends from stealing buffers from our ring.  As long as we cycle
    2211                 :  * through the ring faster than the global clock-sweep cycles, buffers in
    2212                 :  * our ring won't be chosen as victims for replacement by other backends.)
    2213                 :  *
    2214                 :  * This should be applied only to shared buffers, never local ones.
    2215                 :  *
    2216                 :  * Since buffers are pinned/unpinned very frequently, pin buffers without
    2217                 :  * taking the buffer header lock; instead update the state variable in loop of
    2218                 :  * taking the buffer header lock; instead update the state variable in a loop of
    2219                 :  *
    2220                 :  * Note that ResourceOwnerEnlargeBuffers must have been done already.
    2221                 :  *
    2222                 :  * Returns true if buffer is BM_VALID, else false.  This provision allows
    2223                 :  * some callers to avoid an extra spinlock cycle.
    2224                 :  */
    2225                 : static bool
    2226 GIC    67748593 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
    2227 ECB             : {
    2228 CBC    67748593 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    2229                 :     bool        result;
    2230                 :     PrivateRefCountEntry *ref;
    2231                 : 
    2232 GNC    67748593 :     Assert(!BufferIsLocal(b));
    2233                 : 
    2234 GIC    67748593 :     ref = GetPrivateRefCountEntry(b, true);
    2235                 : 
    2236        67748593 :     if (ref == NULL)
    2237                 :     {
    2238 ECB             :         uint32      buf_state;
    2239                 :         uint32      old_buf_state;
    2240                 : 
    2241 CBC    65679782 :         ReservePrivateRefCountEntry();
    2242 GIC    65679782 :         ref = NewPrivateRefCountEntry(b);
    2243                 : 
    2244        65679782 :         old_buf_state = pg_atomic_read_u32(&buf->state);
    2245                 :         for (;;)
    2246 ECB             :         {
    2247 GIC    65691662 :             if (old_buf_state & BM_LOCKED)
    2248 CBC         503 :                 old_buf_state = WaitBufHdrUnlocked(buf);
    2249 ECB             : 
    2250 GIC    65691662 :             buf_state = old_buf_state;
    2251 ECB             : 
    2252                 :             /* increase refcount */
    2253 GIC    65691662 :             buf_state += BUF_REFCOUNT_ONE;
    2254 ECB             : 
    2255 GIC    65691662 :             if (strategy == NULL)
    2256                 :             {
    2257                 :                 /* Default case: increase usagecount unless already max. */
    2258        65132340 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
    2259         3299577 :                     buf_state += BUF_USAGECOUNT_ONE;
    2260                 :             }
    2261                 :             else
    2262                 :             {
    2263                 :                 /*
    2264                 :                  * Ring buffers shouldn't evict others from the pool.  Thus we
    2265                 :                  * don't make usagecount more than 1.
    2266                 :                  */
    2267          559322 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    2268           24540 :                     buf_state += BUF_USAGECOUNT_ONE;
    2269                 :             }
    2270 ECB             : 
    2271 GIC    65691662 :             if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
    2272                 :                                                buf_state))
    2273                 :             {
    2274        65679782 :                 result = (buf_state & BM_VALID) != 0;
    2275 ECB             : 
    2276                 :                 /*
    2277                 :                  * Assume that we acquired a buffer pin for the purposes of
    2278                 :                  * Valgrind buffer client checks (even in !result case) to
    2279                 :                  * keep things simple.  Buffers that are unsafe to access are
    2280                 :                  * not generally guaranteed to be marked undefined or
    2281                 :                  * non-accessible in any case.
    2282                 :                  */
    2283                 :                 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
    2284 GIC    65679782 :                 break;
    2285                 :             }
    2286                 :         }
    2287 ECB             :     }
    2288                 :     else
    2289                 :     {
    2290                 :         /*
    2291                 :          * If we previously pinned the buffer, it must surely be valid.
    2292                 :          *
    2293                 :          * Note: We deliberately avoid a Valgrind client request here.
    2294                 :          * Individual access methods can optionally superimpose buffer page
    2295                 :          * client requests on top of our client requests to enforce that
    2296                 :          * buffers are only accessed while locked (and pinned).  It's possible
    2297                 :          * that the buffer page is legitimately non-accessible here.  We
    2298                 :          * cannot meddle with that.
    2299                 :          */
    2300 GIC     2068811 :         result = true;
    2301 ECB             :     }
    2302                 : 
    2303 GIC    67748593 :     ref->refcount++;
    2304        67748593 :     Assert(ref->refcount > 0);
    2305        67748593 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
    2306        67748593 :     return result;
    2307                 : }
    2308                 : 
    2309                 : /*
    2310                 :  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
    2311                 :  * The spinlock is released before return.
    2312                 :  *
    2313                 :  * As this function is called with the spinlock held, the caller has to
    2314                 :  * previously call ReservePrivateRefCountEntry().
    2315                 :  *
    2316                 :  * Currently, no callers of this function want to modify the buffer's
    2317                 :  * usage_count at all, so there's no need for a strategy parameter.
    2318                 :  * Also we don't bother with a BM_VALID test (the caller could check that for
    2319 ECB             :  * itself).
    2320                 :  *
    2321                 :  * Also all callers only ever use this function when it's known that the
    2322                 :  * buffer can't have a preexisting pin by this backend. That allows us to skip
    2323                 :  * searching the private refcount array & hash, which is a boon, because the
    2324                 :  * spinlock is still held.
    2325                 :  *
    2326                 :  * Note: use of this routine is frequently mandatory, not just an optimization
    2327                 :  * to save a spin lock/unlock cycle, because we need to pin a buffer before
    2328                 :  * its state can change under us.
    2329                 :  */
    2330                 : static void
    2331 GIC     2792328 : PinBuffer_Locked(BufferDesc *buf)
    2332                 : {
    2333                 :     Buffer      b;
    2334                 :     PrivateRefCountEntry *ref;
    2335                 :     uint32      buf_state;
    2336                 : 
    2337 ECB             :     /*
    2338                 :      * As explained, we don't expect any preexisting pins. That allows us to
    2339                 :      * manipulate the PrivateRefCount after releasing the spinlock.
    2340                 :      */
    2341 GIC     2792328 :     Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
    2342                 : 
    2343                 :     /*
    2344                 :      * Buffer can't have a preexisting pin, so mark its page as defined to
    2345 ECB             :      * Valgrind (this is similar to the PinBuffer() case where the backend
    2346                 :      * doesn't already have a buffer pin).
    2347                 :      */
    2348                 :     VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
    2349                 : 
    2350                 :     /*
    2351                 :      * Since we hold the buffer spinlock, we can update the buffer state and
    2352                 :      * release the lock in one operation.
    2353                 :      */
    2354 GIC     2792328 :     buf_state = pg_atomic_read_u32(&buf->state);
    2355 CBC     2792328 :     Assert(buf_state & BM_LOCKED);
    2356 GIC     2792328 :     buf_state += BUF_REFCOUNT_ONE;
    2357         2792328 :     UnlockBufHdr(buf, buf_state);
    2358 ECB             : 
    2359 GIC     2792328 :     b = BufferDescriptorGetBuffer(buf);
    2360                 : 
    2361         2792328 :     ref = NewPrivateRefCountEntry(b);
    2362         2792328 :     ref->refcount++;
    2363                 : 
    2364         2792328 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
    2365         2792328 : }
    2366                 : 
    2367                 : /*
    2368                 :  * UnpinBuffer -- make buffer available for replacement.
    2369                 :  *
    2370                 :  * This should be applied only to shared buffers, never local ones.  This
    2371                 :  * always adjusts CurrentResourceOwner.
    2372 ECB             :  */
    2373                 : static void
    2374 GNC    81367861 : UnpinBuffer(BufferDesc *buf)
    2375                 : {
    2376 ECB             :     PrivateRefCountEntry *ref;
    2377 GIC    81367861 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    2378                 : 
    2379 GNC    81367861 :     Assert(!BufferIsLocal(b));
    2380                 : 
    2381                 :     /* not moving as we're likely deleting it soon anyway */
    2382 CBC    81367861 :     ref = GetPrivateRefCountEntry(b, false);
    2383 GIC    81367861 :     Assert(ref != NULL);
    2384                 : 
    2385 GNC    81367861 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
    2386                 : 
    2387 GIC    81367861 :     Assert(ref->refcount > 0);
    2388        81367861 :     ref->refcount--;
    2389        81367861 :     if (ref->refcount == 0)
    2390                 :     {
    2391                 :         uint32      buf_state;
    2392                 :         uint32      old_buf_state;
    2393 ECB             : 
    2394                 :         /*
    2395                 :          * Mark buffer non-accessible to Valgrind.
    2396                 :          *
    2397                 :          * Note that the buffer may have already been marked non-accessible
    2398                 :          * within access method code that enforces that buffers are only
    2399                 :          * accessed while a buffer lock is held.
    2400                 :          */
    2401                 :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
    2402                 : 
    2403                 :         /* I'd better not still hold the buffer content lock */
    2404 CBC    68472110 :         Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
    2405 ECB             : 
    2406                 :         /*
    2407                 :          * Decrement the shared reference count.
    2408                 :          *
    2409                 :          * Since buffer spinlock holder can update status using just write,
    2410                 :          * it's not safe to use atomic decrement here; thus use a CAS loop.
    2411                 :          */
    2412 GIC    68472110 :         old_buf_state = pg_atomic_read_u32(&buf->state);
    2413                 :         for (;;)
    2414                 :         {
    2415 CBC    68491196 :             if (old_buf_state & BM_LOCKED)
    2416 GIC         465 :                 old_buf_state = WaitBufHdrUnlocked(buf);
    2417 ECB             : 
    2418 GIC    68491196 :             buf_state = old_buf_state;
    2419 ECB             : 
    2420 GBC    68491196 :             buf_state -= BUF_REFCOUNT_ONE;
    2421 ECB             : 
    2422 GBC    68491196 :             if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
    2423                 :                                                buf_state))
    2424 CBC    68472110 :                 break;
    2425                 :         }
    2426 EUB             : 
    2427                 :         /* Support LockBufferForCleanup() */
    2428 GIC    68472110 :         if (buf_state & BM_PIN_COUNT_WAITER)
    2429                 :         {
    2430                 :             /*
    2431                 :              * Acquire the buffer header lock, re-check that there's a waiter.
    2432 EUB             :              * Another backend could have unpinned this buffer, and already
    2433                 :              * woken up the waiter.  There's no danger of the buffer being
    2434                 :              * replaced after we unpinned it above, as it's pinned by the
    2435                 :              * waiter.
    2436 ECB             :              */
    2437 GIC           2 :             buf_state = LockBufHdr(buf);
    2438 EUB             : 
    2439 GBC           2 :             if ((buf_state & BM_PIN_COUNT_WAITER) &&
    2440               2 :                 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    2441               2 :             {
    2442                 :                 /* we just released the last pin other than the waiter's */
    2443 GIC           2 :                 int         wait_backend_pgprocno = buf->wait_backend_pgprocno;
    2444                 : 
    2445               2 :                 buf_state &= ~BM_PIN_COUNT_WAITER;
    2446 CBC           2 :                 UnlockBufHdr(buf, buf_state);
    2447 GBC           2 :                 ProcSendSignal(wait_backend_pgprocno);
    2448                 :             }
    2449                 :             else
    2450 UIC           0 :                 UnlockBufHdr(buf, buf_state);
    2451                 :         }
    2452 GIC    68472110 :         ForgetPrivateRefCountEntry(ref);
    2453                 :     }
    2454        81367861 : }
    2455                 : 
    2456                 : #define ST_SORT sort_checkpoint_bufferids
    2457                 : #define ST_ELEMENT_TYPE CkptSortItem
    2458                 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
    2459 ECB             : #define ST_SCOPE static
    2460                 : #define ST_DEFINE
    2461                 : #include <lib/sort_template.h>
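The ST_* defines above instantiate a type-specialized sort from lib/sort_template.h. Assuming the template's usual generated signature (element pointer plus count), the include is expected to emit roughly the declaration sketched below, which BufferSync() invokes later as sort_checkpoint_bufferids(CkptBufferIds, num_to_scan):

    /* approximate shape of the generated specialization (sketch, not exact template output) */
    static void sort_checkpoint_bufferids(CkptSortItem *begin, size_t n);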
    2462                 : 
    2463                 : /*
    2464                 :  * BufferSync -- Write out all dirty buffers in the pool.
    2465                 :  *
    2466                 :  * This is called at checkpoint time to write out all dirty shared buffers.
    2467                 :  * The checkpoint request flags should be passed in.  If CHECKPOINT_IMMEDIATE
    2468                 :  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
    2469                 :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
    2470                 :  * unlogged buffers, which are otherwise skipped.  The remaining flags
    2471                 :  * currently have no effect here.
    2472                 :  */
    2473                 : static void
    2474 CBC        2363 : BufferSync(int flags)
    2475                 : {
    2476                 :     uint32      buf_state;
    2477                 :     int         buf_id;
    2478                 :     int         num_to_scan;
    2479                 :     int         num_spaces;
    2480                 :     int         num_processed;
    2481                 :     int         num_written;
    2482 GIC        2363 :     CkptTsStatus *per_ts_stat = NULL;
    2483                 :     Oid         last_tsid;
    2484                 :     binaryheap *ts_heap;
    2485                 :     int         i;
    2486            2363 :     int         mask = BM_DIRTY;
    2487                 :     WritebackContext wb_context;
    2488                 : 
    2489                 :     /* Make sure we can handle the pin inside SyncOneBuffer */
    2490 CBC        2363 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    2491                 : 
    2492 EUB             :     /*
    2493                 :      * Unless this is a shutdown checkpoint or we have been explicitly told,
    2494                 :      * we write only permanent, dirty buffers.  But at shutdown or end of
    2495                 :      * recovery, we write all dirty buffers.
    2496                 :      */
    2497 GIC        2363 :     if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    2498                 :                     CHECKPOINT_FLUSH_ALL))))
    2499             774 :         mask |= BM_PERMANENT;
    2500 EUB             : 
    2501                 :     /*
    2502                 :      * Loop over all buffers, and mark the ones that need to be written with
    2503                 :      * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
    2504                 :      * can estimate how much work needs to be done.
    2505                 :      *
    2506                 :      * This allows us to write only those pages that were dirty when the
    2507                 :      * checkpoint began, and not those that get dirtied while it proceeds.
    2508                 :      * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
    2509                 :      * later in this function, or by normal backends or the bgwriter cleaning
    2510                 :      * scan, the flag is cleared.  Any buffer dirtied after this point won't
    2511                 :      * have the flag set.
    2512                 :      *
    2513                 :      * Note that if we fail to write some buffer, we may leave buffers with
    2514                 :      * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
    2515                 :      * certainly need to be written for the next checkpoint attempt, too.
    2516                 :      */
    2517 GIC        2363 :     num_to_scan = 0;
    2518        34165771 :     for (buf_id = 0; buf_id < NBuffers; buf_id++)
    2519                 :     {
    2520        34163408 :         BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    2521                 : 
    2522                 :         /*
    2523                 :          * Header spinlock is enough to examine BM_DIRTY, see comment in
    2524                 :          * SyncOneBuffer.
    2525                 :          */
    2526        34163408 :         buf_state = LockBufHdr(bufHdr);
    2527                 : 
    2528        34163408 :         if ((buf_state & mask) == mask)
    2529                 :         {
    2530                 :             CkptSortItem *item;
    2531                 : 
    2532 GBC      477090 :             buf_state |= BM_CHECKPOINT_NEEDED;
    2533                 : 
    2534          477090 :             item = &CkptBufferIds[num_to_scan++];
    2535          477090 :             item->buf_id = buf_id;
    2536 GNC      477090 :             item->tsId = bufHdr->tag.spcOid;
    2537          477090 :             item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
    2538          477090 :             item->forkNum = BufTagGetForkNum(&bufHdr->tag);
    2539 GIC      477090 :             item->blockNum = bufHdr->tag.blockNum;
    2540                 :         }
    2541                 : 
    2542 CBC    34163408 :         UnlockBufHdr(bufHdr, buf_state);
    2543                 : 
    2544                 :         /* Check for barrier events in case NBuffers is large. */
    2545        34163408 :         if (ProcSignalBarrierPending)
    2546 LBC           0 :             ProcessProcSignalBarrier();
    2547                 :     }
    2548 ECB             : 
    2549 GIC        2363 :     if (num_to_scan == 0)
    2550 CBC         767 :         return;                 /* nothing to do */
    2551 ECB             : 
    2552 CBC        1596 :     WritebackContextInit(&wb_context, &checkpoint_flush_after);
    2553                 : 
    2554 ECB             :     TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
    2555                 : 
    2556                 :     /*
    2557                 :      * Sort buffers that need to be written to reduce the likelihood of random
    2558                 :      * IO. The sorting is also important for the implementation of balancing
    2559                 :      * writes between tablespaces. Without balancing writes we'd potentially
    2560                 :      * end up writing to the tablespaces one-by-one, possibly overloading the
    2561                 :      * underlying system.
    2562                 :      */
    2563 CBC        1596 :     sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
    2564                 : 
    2565 GIC        1596 :     num_spaces = 0;
    2566                 : 
    2567                 :     /*
    2568                 :      * Allocate progress status for each tablespace with buffers that need to
    2569                 :      * be flushed. This requires the to-be-flushed array to be sorted.
    2570                 :      */
    2571            1596 :     last_tsid = InvalidOid;
    2572          478686 :     for (i = 0; i < num_to_scan; i++)
    2573                 :     {
    2574                 :         CkptTsStatus *s;
    2575 ECB             :         Oid         cur_tsid;
    2576                 : 
    2577 GIC      477090 :         cur_tsid = CkptBufferIds[i].tsId;
    2578                 : 
    2579                 :         /*
    2580                 :          * Grow array of per-tablespace status structs, every time a new
    2581                 :          * tablespace is found.
    2582                 :          */
    2583          477090 :         if (last_tsid == InvalidOid || last_tsid != cur_tsid)
    2584 CBC        2677 :         {
    2585 ECB             :             Size        sz;
    2586                 : 
    2587 CBC        2677 :             num_spaces++;
    2588                 : 
    2589                 :             /*
    2590                 :              * Not worth adding grow-by-power-of-2 logic here - even with a
    2591 ECB             :              * few hundred tablespaces this should be fine.
    2592                 :              */
    2593 CBC        2677 :             sz = sizeof(CkptTsStatus) * num_spaces;
    2594 ECB             : 
    2595 CBC        2677 :             if (per_ts_stat == NULL)
    2596 GIC        1596 :                 per_ts_stat = (CkptTsStatus *) palloc(sz);
    2597 ECB             :             else
    2598 CBC        1081 :                 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
    2599 ECB             : 
    2600 GIC        2677 :             s = &per_ts_stat[num_spaces - 1];
    2601 CBC        2677 :             memset(s, 0, sizeof(*s));
    2602            2677 :             s->tsId = cur_tsid;
    2603 ECB             : 
    2604                 :             /*
    2605                 :              * The first buffer in this tablespace. As CkptBufferIds is sorted
    2606                 :              * by tablespace all (s->num_to_scan) buffers in this tablespace
    2607                 :              * will follow afterwards.
    2608                 :              */
    2609 CBC        2677 :             s->index = i;
    2610                 : 
    2611                 :             /*
    2612 ECB             :              * progress_slice will be determined once we know how many buffers
    2613                 :              * are in each tablespace, i.e. after this loop.
    2614                 :              */
    2615                 : 
    2616 CBC        2677 :             last_tsid = cur_tsid;
    2617                 :         }
    2618                 :         else
    2619                 :         {
    2620 GIC      474413 :             s = &per_ts_stat[num_spaces - 1];
    2621                 :         }
    2622                 : 
    2623          477090 :         s->num_to_scan++;
    2624                 : 
    2625                 :         /* Check for barrier events. */
    2626          477090 :         if (ProcSignalBarrierPending)
    2627 UIC           0 :             ProcessProcSignalBarrier();
    2628                 :     }
    2629 ECB             : 
    2630 GIC        1596 :     Assert(num_spaces > 0);
    2631                 : 
    2632                 :     /*
    2633                 :      * Build a min-heap over the write-progress in the individual tablespaces,
    2634                 :      * and compute how large a portion of the total progress a single
    2635 ECB             :      * processed buffer is.
    2636 EUB             :      */
    2637 GIC        1596 :     ts_heap = binaryheap_allocate(num_spaces,
    2638 ECB             :                                   ts_ckpt_progress_comparator,
    2639                 :                                   NULL);
    2640                 : 
    2641 CBC        4273 :     for (i = 0; i < num_spaces; i++)
    2642                 :     {
    2643 GIC        2677 :         CkptTsStatus *ts_stat = &per_ts_stat[i];
    2644 ECB             : 
    2645 GIC        2677 :         ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
    2646 ECB             : 
    2647 CBC        2677 :         binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
    2648                 :     }
    2649                 : 
    2650            1596 :     binaryheap_build(ts_heap);
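A worked example of the balancing arithmetic above, with made-up counts: if num_to_scan = 900 buffers split 600/300 between two tablespaces, their progress_slice values are 900/600 = 1.5 and 900/300 = 3.0. After writing half of each tablespace's buffers (300 and 150 respectively), both have accumulated the same progress of 450, which is what lets the min-heap below keep handing writes to whichever tablespace is proportionally furthest behind.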
    2651                 : 
    2652                 :     /*
    2653 ECB             :      * Iterate through to-be-checkpointed buffers and write the ones (still)
    2654                 :      * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
    2655                 :      * tablespaces; otherwise the sorting would lead to only one tablespace
    2656                 :      * receiving writes at a time, making inefficient use of the hardware.
    2657                 :      */
    2658 CBC        1596 :     num_processed = 0;
    2659            1596 :     num_written = 0;
    2660 GIC      478686 :     while (!binaryheap_empty(ts_heap))
    2661 ECB             :     {
    2662 GIC      477090 :         BufferDesc *bufHdr = NULL;
    2663 ECB             :         CkptTsStatus *ts_stat = (CkptTsStatus *)
    2664 GIC      477090 :         DatumGetPointer(binaryheap_first(ts_heap));
    2665                 : 
    2666          477090 :         buf_id = CkptBufferIds[ts_stat->index].buf_id;
    2667          477090 :         Assert(buf_id != -1);
    2668                 : 
    2669 CBC      477090 :         bufHdr = GetBufferDescriptor(buf_id);
    2670                 : 
    2671          477090 :         num_processed++;
    2672 ECB             : 
    2673                 :         /*
    2674                 :          * We don't need to acquire the lock here, because we're only looking
    2675                 :          * at a single bit. It's possible that someone else writes the buffer
    2676                 :          * and clears the flag right after we check, but that doesn't matter
    2677                 :          * since SyncOneBuffer will then do nothing.  However, there is a
    2678                 :          * further race condition: it's conceivable that between the time we
    2679                 :          * examine the bit here and the time SyncOneBuffer acquires the lock,
    2680                 :          * someone else not only wrote the buffer but replaced it with another
    2681                 :          * page and dirtied it.  In that improbable case, SyncOneBuffer will
    2682                 :          * write the buffer though we didn't need to.  It doesn't seem worth
    2683                 :          * guarding against this, though.
    2684                 :          */
    2685 GIC      477090 :         if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
    2686                 :         {
    2687          474825 :             if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
    2688                 :             {
    2689                 :                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
    2690          474825 :                 PendingCheckpointerStats.buf_written_checkpoints++;
    2691          474825 :                 num_written++;
    2692 ECB             :             }
    2693                 :         }
    2694                 : 
    2695                 :         /*
    2696                 :          * Measure progress independent of actually having to flush the buffer
    2697                 :          * - otherwise writing becomes unbalanced.
    2698                 :          */
    2699 CBC      477090 :         ts_stat->progress += ts_stat->progress_slice;
    2700 GIC      477090 :         ts_stat->num_scanned++;
    2701 CBC      477090 :         ts_stat->index++;
    2702 ECB             : 
    2703                 :         /* Have all the buffers from the tablespace been processed? */
    2704 CBC      477090 :         if (ts_stat->num_scanned == ts_stat->num_to_scan)
    2705 ECB             :         {
    2706 CBC        2677 :             binaryheap_remove_first(ts_heap);
    2707 ECB             :         }
    2708                 :         else
    2709                 :         {
    2710                 :             /* update heap with the new progress */
    2711 GIC      474413 :             binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
    2712                 :         }
    2713 ECB             : 
    2714                 :         /*
    2715                 :          * Sleep to throttle our I/O rate.
    2716                 :          *
    2717                 :          * (This will check for barrier events even if it doesn't sleep.)
    2718                 :          */
    2719 CBC      477090 :         CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
    2720                 :     }
    2721                 : 
    2722                 :     /* issue all pending flushes */
    2723            1596 :     IssuePendingWritebacks(&wb_context);
    2724                 : 
    2725 GIC        1596 :     pfree(per_ts_stat);
    2726            1596 :     per_ts_stat = NULL;
    2727            1596 :     binaryheap_free(ts_heap);
    2728                 : 
    2729                 :     /*
    2730                 :      * Update checkpoint statistics. As noted above, this doesn't include
    2731                 :      * buffers written by other backends or bgwriter scan.
    2732                 :      */
    2733            1596 :     CheckpointStats.ckpt_bufs_written += num_written;
    2734                 : 
    2735                 :     TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
    2736                 : }
    2737                 : 
    2738                 : /*
    2739                 :  * BgBufferSync -- Write out some dirty buffers in the pool.
    2740                 :  *
    2741                 :  * This is called periodically by the background writer process.
    2742                 :  *
    2743                 :  * Returns true if it's appropriate for the bgwriter process to go into
    2744                 :  * low-power hibernation mode.  (This happens if the strategy clock sweep
    2745                 :  * has been "lapped" and no buffer allocations have occurred recently,
    2746                 :  * or if the bgwriter has been effectively disabled by setting
    2747                 :  * bgwriter_lru_maxpages to 0.)
    2748                 :  */
    2749 ECB             : bool
    2750 GIC       10438 : BgBufferSync(WritebackContext *wb_context)
    2751 ECB             : {
    2752                 :     /* info obtained from freelist.c */
    2753                 :     int         strategy_buf_id;
    2754                 :     uint32      strategy_passes;
    2755                 :     uint32      recent_alloc;
    2756                 : 
    2757                 :     /*
    2758                 :      * Information saved between calls so we can determine the strategy
    2759                 :      * point's advance rate and avoid scanning already-cleaned buffers.
    2760                 :      */
    2761                 :     static bool saved_info_valid = false;
    2762                 :     static int  prev_strategy_buf_id;
    2763                 :     static uint32 prev_strategy_passes;
    2764                 :     static int  next_to_clean;
    2765                 :     static uint32 next_passes;
    2766                 : 
    2767                 :     /* Moving averages of allocation rate and clean-buffer density */
    2768                 :     static float smoothed_alloc = 0;
    2769                 :     static float smoothed_density = 10.0;
    2770                 : 
    2771                 :     /* Potentially these could be tunables, but for now, not */
    2772 GIC       10438 :     float       smoothing_samples = 16;
    2773 CBC       10438 :     float       scan_whole_pool_milliseconds = 120000.0;
    2774                 : 
    2775                 :     /* Used to compute how far we scan ahead */
    2776 ECB             :     long        strategy_delta;
    2777                 :     int         bufs_to_lap;
    2778                 :     int         bufs_ahead;
    2779                 :     float       scans_per_alloc;
    2780                 :     int         reusable_buffers_est;
    2781                 :     int         upcoming_alloc_est;
    2782                 :     int         min_scan_buffers;
    2783                 : 
    2784                 :     /* Variables for the scanning loop proper */
    2785                 :     int         num_to_scan;
    2786                 :     int         num_written;
    2787                 :     int         reusable_buffers;
    2788                 : 
    2789                 :     /* Variables for final smoothed_density update */
    2790                 :     long        new_strategy_delta;
    2791                 :     uint32      new_recent_alloc;
    2792                 : 
    2793                 :     /*
    2794                 :      * Find out where the freelist clock sweep currently is, and how many
    2795                 :      * buffer allocations have happened since our last call.
    2796                 :      */
    2797 CBC       10438 :     strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
    2798                 : 
    2799                 :     /* Report buffer alloc counts to pgstat */
    2800 GIC       10438 :     PendingBgWriterStats.buf_alloc += recent_alloc;
    2801                 : 
    2802                 :     /*
    2803                 :      * If we're not running the LRU scan, just stop after updating the
    2804                 :      * statistics.  We mark the saved state invalid so that we can recover
    2805                 :      * sanely if the LRU scan is turned back on later.
    2806                 :      */
    2807 CBC       10438 :     if (bgwriter_lru_maxpages <= 0)
    2808                 :     {
    2809 UIC           0 :         saved_info_valid = false;
    2810               0 :         return true;
    2811                 :     }
    2812                 : 
    2813                 :     /*
    2814                 :      * Compute strategy_delta = how many buffers have been scanned by the
    2815                 :      * clock sweep since last time.  If first time through, assume none. Then
    2816                 :      * see if we are still ahead of the clock sweep, and if so, how many
    2817                 :      * buffers we could scan before we'd catch up with it and "lap" it. Note:
    2818                 :      * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
    2819                 :      * behavior when the passes counts wrap around.
    2820                 :      */
    2821 GIC       10438 :     if (saved_info_valid)
    2822                 :     {
    2823 CBC       10083 :         int32       passes_delta = strategy_passes - prev_strategy_passes;
    2824                 : 
    2825 GIC       10083 :         strategy_delta = strategy_buf_id - prev_strategy_buf_id;
    2826 CBC       10083 :         strategy_delta += (long) passes_delta * NBuffers;
    2827 ECB             : 
    2828 CBC       10083 :         Assert(strategy_delta >= 0);
    2829 ECB             : 
    2830 GIC       10083 :         if ((int32) (next_passes - strategy_passes) > 0)
    2831                 :         {
    2832                 :             /* we're one pass ahead of the strategy point */
    2833            2017 :             bufs_to_lap = strategy_buf_id - next_to_clean;
    2834                 : #ifdef BGW_DEBUG
    2835                 :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    2836                 :                  next_passes, next_to_clean,
    2837                 :                  strategy_passes, strategy_buf_id,
    2838                 :                  strategy_delta, bufs_to_lap);
    2839                 : #endif
    2840                 :         }
    2841            8066 :         else if (next_passes == strategy_passes &&
    2842            6768 :                  next_to_clean >= strategy_buf_id)
    2843                 :         {
    2844                 :             /* on same pass, but ahead or at least not behind */
    2845            6574 :             bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
    2846                 : #ifdef BGW_DEBUG
    2847                 :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    2848                 :                  next_passes, next_to_clean,
    2849                 :                  strategy_passes, strategy_buf_id,
    2850                 :                  strategy_delta, bufs_to_lap);
    2851                 : #endif
    2852                 :         }
    2853                 :         else
    2854 ECB             :         {
    2855                 :             /*
    2856                 :              * We're behind, so skip forward to the strategy point and start
    2857                 :              * cleaning from there.
    2858                 :              */
    2859                 : #ifdef BGW_DEBUG
    2860                 :             elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
    2861                 :                  next_passes, next_to_clean,
    2862                 :                  strategy_passes, strategy_buf_id,
    2863                 :                  strategy_delta);
    2864                 : #endif
    2865 GIC        1492 :             next_to_clean = strategy_buf_id;
    2866            1492 :             next_passes = strategy_passes;
    2867            1492 :             bufs_to_lap = NBuffers;
    2868                 :         }
    2869                 :     }
    2870                 :     else
    2871                 :     {
    2872                 :         /*
    2873                 :          * Initializing at startup or after LRU scanning had been off. Always
    2874                 :          * start at the strategy point.
    2875                 :          */
    2876                 : #ifdef BGW_DEBUG
    2877 ECB             :         elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
    2878                 :              strategy_passes, strategy_buf_id);
    2879                 : #endif
    2880 CBC         355 :         strategy_delta = 0;
    2881 GIC         355 :         next_to_clean = strategy_buf_id;
    2882 CBC         355 :         next_passes = strategy_passes;
    2883 GIC         355 :         bufs_to_lap = NBuffers;
    2884 ECB             :     }
    2885                 : 
    2886                 :     /* Update saved info for next time */
    2887 CBC       10438 :     prev_strategy_buf_id = strategy_buf_id;
    2888           10438 :     prev_strategy_passes = strategy_passes;
    2889 GIC       10438 :     saved_info_valid = true;
    2890                 : 
    2891                 :     /*
    2892                 :      * Compute how many buffers had to be scanned for each new allocation, ie,
    2893                 :      * 1/density of reusable buffers, and track a moving average of that.
    2894                 :      *
    2895                 :      * If the strategy point didn't move, we don't update the density estimate.
    2896                 :      */
    2897 CBC       10438 :     if (strategy_delta > 0 && recent_alloc > 0)
    2898                 :     {
    2899 GIC        2053 :         scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
    2900 CBC        2053 :         smoothed_density += (scans_per_alloc - smoothed_density) /
    2901                 :             smoothing_samples;
    2902 ECB             :     }
    2903                 : 
    2904                 :     /*
    2905                 :      * Estimate how many reusable buffers there are between the current
    2906                 :      * strategy point and where we've scanned ahead to, based on the smoothed
    2907                 :      * density estimate.
    2908                 :      */
    2909 GIC       10438 :     bufs_ahead = NBuffers - bufs_to_lap;
    2910 CBC       10438 :     reusable_buffers_est = (float) bufs_ahead / smoothed_density;
    2911 ECB             : 
    2912                 :     /*
    2913                 :      * Track a moving average of recent buffer allocations.  Here, rather than
    2914                 :      * a true average we want a fast-attack, slow-decline behavior: we
    2915                 :      * immediately follow any increase, but decline only gradually.
    2916                 :      */
    2917 GIC       10438 :     if (smoothed_alloc <= (float) recent_alloc)
    2918            2468 :         smoothed_alloc = recent_alloc;
    2919                 :     else
    2920            7970 :         smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
    2921                 :             smoothing_samples;
    2922                 : 
    2923                 :     /* Scale the estimate by a GUC to allow more aggressive tuning. */
    2924           10438 :     upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
    2925                 : 
    2926                 :     /*
    2927 ECB             :      * If recent_alloc remains at zero for many cycles, smoothed_alloc will
    2928                 :      * eventually underflow to zero, and the underflows produce annoying
    2929                 :      * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
    2930                 :      * zero, there's no point in tracking smaller and smaller values of
    2931                 :      * smoothed_alloc, so just reset it to exactly zero to avoid this
    2932                 :      * syndrome.  It will pop back up as soon as recent_alloc increases.
    2933                 :      */
    2934 GIC       10438 :     if (upcoming_alloc_est == 0)
    2935 CBC        1384 :         smoothed_alloc = 0;
    2936                 : 
    2937                 :     /*
    2938 ECB             :      * Even in cases where there's been little or no buffer allocation
    2939                 :      * activity, we want to make a small amount of progress through the buffer
    2940                 :      * cache so that as many reusable buffers as possible are clean after an
    2941                 :      * idle period.
    2942                 :      *
    2943                 :      * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
    2944                 :      * the BGW will be called during the scan_whole_pool time; slice the
    2945                 :      * buffer pool into that many sections.
    2946                 :      */
    2947 CBC       10438 :     min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
    2948                 : 
    2949 GIC       10438 :     if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
    2950                 :     {
    2951 ECB             : #ifdef BGW_DEBUG
    2952                 :         elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
    2953                 :              upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
    2954                 : #endif
    2955 GIC        5960 :         upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
    2956                 :     }
    2957                 : 
    2958                 :     /*
    2959                 :      * Now write out dirty reusable buffers, working forward from the
    2960 ECB             :      * next_to_clean point, until we have lapped the strategy scan, or cleaned
    2961                 :      * enough buffers to match our estimate of the next cycle's allocation
    2962                 :      * requirements, or hit the bgwriter_lru_maxpages limit.
    2963                 :      */
    2964                 : 
    2965                 :     /* Make sure we can handle the pin inside SyncOneBuffer */
    2966 CBC       10438 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    2967                 : 
    2968           10438 :     num_to_scan = bufs_to_lap;
    2969           10438 :     num_written = 0;
    2970           10438 :     reusable_buffers = reusable_buffers_est;
    2971                 : 
    2972                 :     /* Execute the LRU scan */
    2973 GBC     1221474 :     while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    2974                 :     {
    2975 CBC     1211036 :         int         sync_state = SyncOneBuffer(next_to_clean, true,
    2976                 :                                                wb_context);
    2977 ECB             : 
    2978 GIC     1211036 :         if (++next_to_clean >= NBuffers)
    2979                 :         {
    2980            1758 :             next_to_clean = 0;
    2981            1758 :             next_passes++;
    2982                 :         }
    2983         1211036 :         num_to_scan--;
    2984                 : 
    2985         1211036 :         if (sync_state & BUF_WRITTEN)
    2986                 :         {
    2987           13485 :             reusable_buffers++;
    2988           13485 :             if (++num_written >= bgwriter_lru_maxpages)
    2989                 :             {
    2990 UIC           0 :                 PendingBgWriterStats.maxwritten_clean++;
    2991               0 :                 break;
    2992                 :             }
    2993                 :         }
    2994 GIC     1197551 :         else if (sync_state & BUF_REUSABLE)
    2995          906569 :             reusable_buffers++;
    2996                 :     }
    2997 ECB             : 
    2998 GIC       10438 :     PendingBgWriterStats.buf_written_clean += num_written;
    2999                 : 
    3000                 : #ifdef BGW_DEBUG
    3001                 :     elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
    3002                 :          recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
    3003                 :          smoothed_density, reusable_buffers_est, upcoming_alloc_est,
    3004                 :          bufs_to_lap - num_to_scan,
    3005 ECB             :          num_written,
    3006                 :          reusable_buffers - reusable_buffers_est);
    3007                 : #endif
    3008                 : 
    3009                 :     /*
    3010                 :      * Consider the above scan as being like a new allocation scan.
    3011                 :      * Characterize its density and update the smoothed one based on it. This
    3012                 :      * effectively halves the moving average period in cases where both the
    3013                 :      * strategy and the background writer are doing some useful scanning,
    3014                 :      * which is helpful because a long memory isn't as desirable on the
    3015                 :      * density estimates.
    3016                 :      */
    3017 GIC       10438 :     new_strategy_delta = bufs_to_lap - num_to_scan;
    3018           10438 :     new_recent_alloc = reusable_buffers - reusable_buffers_est;
    3019           10438 :     if (new_strategy_delta > 0 && new_recent_alloc > 0)
    3020 ECB             :     {
    3021 GIC        7699 :         scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
    3022 CBC        7699 :         smoothed_density += (scans_per_alloc - smoothed_density) /
    3023                 :             smoothing_samples;
    3024                 : 
    3025                 : #ifdef BGW_DEBUG
    3026                 :         elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
    3027                 :              new_recent_alloc, new_strategy_delta,
    3028                 :              scans_per_alloc, smoothed_density);
    3029                 : #endif
    3030                 :     }
    3031                 : 
    3032                 :     /* Return true if OK to hibernate */
    3033 GIC       10438 :     return (bufs_to_lap == 0 && recent_alloc == 0);
    3034                 : }
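To put concrete numbers on the pacing above, here is a small self-contained calculation. All inputs are assumptions made for the example (128MB of shared buffers, the documented defaults of 200 ms for bgwriter_delay and 2.0 for bgwriter_lru_multiplier, and made-up allocation figures); nothing is taken from this report:

    #include <stdio.h>

    int
    main(void)
    {
        /* assumed, illustrative settings */
        int     NBuffers = 16384;               /* 128MB of shared buffers */
        int     BgWriterDelay = 200;            /* ms between bgwriter rounds */
        double  bgwriter_lru_multiplier = 2.0;
        double  smoothing_samples = 16;
        double  scan_whole_pool_milliseconds = 120000.0;

        double  smoothed_alloc = 100.0;         /* running average so far */
        int     recent_alloc = 260;             /* allocations since last call */

        /* fast attack: an increase is adopted immediately */
        if (smoothed_alloc <= recent_alloc)
            smoothed_alloc = recent_alloc;
        else
            smoothed_alloc += (recent_alloc - smoothed_alloc) / smoothing_samples;

        int     upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
        int     min_scan_buffers = (int) (NBuffers /
                                          (scan_whole_pool_milliseconds / BgWriterDelay));

        printf("upcoming_alloc_est = %d\n", upcoming_alloc_est);    /* 520 */
        printf("min_scan_buffers   = %d\n", min_scan_buffers);      /* 27 */
        return 0;
    }

With these inputs the allocation-driven target (520 reusable buffers ahead of the clock sweep) dwarfs the idle-progress floor of 27 buffers per round, so the LRU scan keeps going until roughly 520 reusable buffers are accounted for, it laps the strategy point, or it hits bgwriter_lru_maxpages.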
    3035                 : 
    3036                 : /*
    3037                 :  * SyncOneBuffer -- process a single buffer during syncing.
    3038                 :  *
    3039                 :  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
    3040 ECB             :  * buffers marked recently used, as these are not replacement candidates.
    3041                 :  *
    3042                 :  * Returns a bitmask containing the following flag bits:
    3043                 :  *  BUF_WRITTEN: we wrote the buffer.
    3044                 :  *  BUF_REUSABLE: buffer is available for replacement, ie, it has
    3045                 :  *      pin count 0 and usage count 0.
    3046                 :  *
    3047                 :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
    3048                 :  * after locking it, but we don't care all that much.)
    3049                 :  *
    3050                 :  * Note: caller must have done ResourceOwnerEnlargeBuffers.
    3051                 :  */
    3052                 : static int
    3053 GIC     1685861 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
    3054                 : {
    3055 CBC     1685861 :     BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    3056 GIC     1685861 :     int         result = 0;
    3057 ECB             :     uint32      buf_state;
    3058                 :     BufferTag   tag;
    3059                 : 
    3060 CBC     1685861 :     ReservePrivateRefCountEntry();
    3061 ECB             : 
    3062                 :     /*
    3063                 :      * Check whether buffer needs writing.
    3064                 :      *
    3065                 :      * We can make this check without taking the buffer content lock so long
    3066                 :      * as we mark pages dirty in access methods *before* logging changes with
    3067                 :      * XLogInsert(): if someone marks the buffer dirty just after our check we
    3068                 :      * don't worry because our checkpoint.redo points before log record for
    3069 EUB             :      * don't worry, because our checkpoint.redo points before the log record for
    3070                 :      * the upcoming changes, so we are not required to write such a dirty buffer.
    3071 GIC     1685861 :     buf_state = LockBufHdr(bufHdr);
    3072 ECB             : 
    3073 CBC     1685861 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
    3074 GIC     1685075 :         BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    3075 ECB             :     {
    3076 GIC      920525 :         result |= BUF_REUSABLE;
    3077                 :     }
    3078          765336 :     else if (skip_recently_used)
    3079                 :     {
    3080                 :         /* Caller told us not to write recently-used buffers */
    3081          290982 :         UnlockBufHdr(bufHdr, buf_state);
    3082          290982 :         return result;
    3083                 :     }
    3084                 : 
    3085         1394879 :     if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
    3086 ECB             :     {
    3087                 :         /* It's clean, so nothing to do */
    3088 CBC      906569 :         UnlockBufHdr(bufHdr, buf_state);
    3089 GIC      906569 :         return result;
    3090                 :     }
    3091                 : 
    3092                 :     /*
    3093                 :      * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
    3094 ECB             :      * buffer is clean by the time we've locked it.)
    3095                 :      */
    3096 GIC      488310 :     PinBuffer_Locked(bufHdr);
    3097          488310 :     LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
    3098                 : 
    3099 GNC      488310 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    3100 ECB             : 
    3101 GIC      488310 :     LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    3102                 : 
    3103          488310 :     tag = bufHdr->tag;
    3104                 : 
    3105 GNC      488310 :     UnpinBuffer(bufHdr);
    3106 ECB             : 
    3107 CBC      488310 :     ScheduleBufferTagForWriteback(wb_context, &tag);
    3108                 : 
    3109 GIC      488310 :     return result | BUF_WRITTEN;
    3110 ECB             : }
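Reading the header comment together with the code, SyncOneBuffer() has exactly four possible return values, which is how the callers above tell "wrote something" apart from "merely confirmed a reusable buffer":

    0                            skipped because pinned or recently used, or in use but already clean
    BUF_REUSABLE                 zero pin and usage counts and already clean; nothing written
    BUF_WRITTEN                  dirty buffer written out, but still pinned or recently used
    BUF_REUSABLE | BUF_WRITTEN   replacement-ready buffer that was dirty and has been flushed

BgBufferSync() counts every non-zero result toward its reusable-buffer estimate but only BUF_WRITTEN results against bgwriter_lru_maxpages; BufferSync() checks only BUF_WRITTEN.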
    3111                 : 
    3112                 : /*
    3113                 :  *      AtEOXact_Buffers - clean up at end of transaction.
    3114                 :  *
    3115                 :  *      As of PostgreSQL 8.0, buffer pins should get released by the
    3116                 :  *      ResourceOwner mechanism.  This routine is just a debugging
    3117                 :  *      cross-check that no pins remain.
    3118                 :  */
    3119                 : void
    3120 GIC      486167 : AtEOXact_Buffers(bool isCommit)
    3121 ECB             : {
    3122 GIC      486167 :     CheckForBufferLeaks();
    3123 ECB             : 
    3124 CBC      486167 :     AtEOXact_LocalBuffers(isCommit);
    3125 ECB             : 
    3126 GIC      486167 :     Assert(PrivateRefCountOverflowed == 0);
    3127          486167 : }
    3128                 : 
    3129                 : /*
    3130                 :  * Initialize access to shared buffer pool
    3131                 :  *
    3132 ECB             :  * This is called during backend startup (whether standalone or under the
    3133                 :  * postmaster).  It sets up for this backend's access to the already-existing
    3134                 :  * buffer pool.
    3135                 :  */
    3136                 : void
    3137 GIC       13291 : InitBufferPoolAccess(void)
    3138                 : {
    3139 ECB             :     HASHCTL     hash_ctl;
    3140                 : 
    3141 GIC       13291 :     memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
    3142                 : 
    3143 CBC       13291 :     hash_ctl.keysize = sizeof(int32);
    3144 GIC       13291 :     hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
    3145                 : 
    3146 CBC       13291 :     PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
    3147                 :                                       HASH_ELEM | HASH_BLOBS);
    3148                 : 
    3149 ECB             :     /*
    3150 EUB             :      * AtProcExit_Buffers needs LWLock access, and therefore has to be called at
    3151                 :      * the corresponding phase of backend shutdown.
    3152                 :      */
    3153 CBC       13291 :     Assert(MyProc != NULL);
    3154 GIC       13291 :     on_shmem_exit(AtProcExit_Buffers, 0);
    3155           13291 : }
    3156                 : 
    3157                 : /*
    3158                 :  * During backend exit, ensure that we released all shared-buffer locks and
    3159                 :  * assert that we have no remaining pins.
    3160 ECB             :  */
    3161                 : static void
    3162 GIC       13291 : AtProcExit_Buffers(int code, Datum arg)
    3163                 : {
    3164           13291 :     UnlockBuffers();
    3165 ECB             : 
    3166 GIC       13291 :     CheckForBufferLeaks();
    3167 ECB             : 
    3168                 :     /* localbuf.c needs a chance too */
    3169 CBC       13291 :     AtProcExit_LocalBuffers();
    3170 GIC       13291 : }
    3171                 : 
    3172 ECB             : /*
    3173                 :  *      CheckForBufferLeaks - ensure this backend holds no buffer pins
    3174                 :  *
    3175                 :  *      As of PostgreSQL 8.0, buffer pins should get released by the
    3176                 :  *      ResourceOwner mechanism.  This routine is just a debugging
    3177                 :  *      cross-check that no pins remain.
    3178                 :  */
    3179                 : static void
    3180 CBC      499458 : CheckForBufferLeaks(void)
    3181 ECB             : {
    3182                 : #ifdef USE_ASSERT_CHECKING
    3183 GIC      499458 :     int         RefCountErrors = 0;
    3184 ECB             :     PrivateRefCountEntry *res;
    3185                 :     int         i;
    3186                 : 
    3187                 :     /* check the array */
    3188 CBC     4495122 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    3189 ECB             :     {
    3190 GIC     3995664 :         res = &PrivateRefCountArray[i];
    3191 ECB             : 
    3192 GIC     3995664 :         if (res->buffer != InvalidBuffer)
    3193 ECB             :         {
    3194 UIC           0 :             PrintBufferLeakWarning(res->buffer);
    3195               0 :             RefCountErrors++;
    3196                 :         }
    3197                 :     }
    3198                 : 
    3199                 :     /* if necessary search the hash */
    3200 GIC      499458 :     if (PrivateRefCountOverflowed)
    3201                 :     {
    3202                 :         HASH_SEQ_STATUS hstat;
    3203                 : 
    3204 UIC           0 :         hash_seq_init(&hstat, PrivateRefCountHash);
    3205               0 :         while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
    3206                 :         {
    3207 LBC           0 :             PrintBufferLeakWarning(res->buffer);
    3208 UIC           0 :             RefCountErrors++;
    3209 ECB             :         }
    3210                 :     }
    3211                 : 
    3212 CBC      499458 :     Assert(RefCountErrors == 0);
    3213 ECB             : #endif
    3214 GIC      499458 : }
    3215                 : 
    3216                 : /*
    3217                 :  * Helper routine to issue warnings when a buffer is unexpectedly pinned
    3218                 :  */
    3219                 : void
    3220 UIC           0 : PrintBufferLeakWarning(Buffer buffer)
    3221 ECB             : {
    3222                 :     BufferDesc *buf;
    3223                 :     int32       loccount;
    3224                 :     char       *path;
    3225                 :     BackendId   backend;
    3226                 :     uint32      buf_state;
    3227                 : 
    3228 LBC           0 :     Assert(BufferIsValid(buffer));
    3229 UIC           0 :     if (BufferIsLocal(buffer))
    3230                 :     {
    3231               0 :         buf = GetLocalBufferDescriptor(-buffer - 1);
    3232               0 :         loccount = LocalRefCount[-buffer - 1];
    3233 LBC           0 :         backend = MyBackendId;
    3234                 :     }
    3235                 :     else
    3236                 :     {
    3237 UIC           0 :         buf = GetBufferDescriptor(buffer - 1);
    3238               0 :         loccount = GetPrivateRefCount(buffer);
    3239               0 :         backend = InvalidBackendId;
    3240                 :     }
    3241 ECB             : 
    3242                 :     /* theoretically we should lock the bufhdr here */
    3243 UNC           0 :     path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
    3244                 :                           BufTagGetForkNum(&buf->tag));
    3245 UIC           0 :     buf_state = pg_atomic_read_u32(&buf->state);
    3246 LBC           0 :     elog(WARNING,
    3247                 :          "buffer refcount leak: [%03d] "
    3248 ECB             :          "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
    3249                 :          buffer, path,
    3250                 :          buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
    3251                 :          BUF_STATE_GET_REFCOUNT(buf_state), loccount);
    3252 UIC           0 :     pfree(path);
    3253               0 : }
    3254                 : 
    3255                 : /*
    3256 ECB             :  * CheckPointBuffers
    3257                 :  *
    3258                 :  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
    3259                 :  *
    3260                 :  * Note: temporary relations do not participate in checkpoints, so they don't
    3261                 :  * need to be flushed.
    3262                 :  */
    3263                 : void
    3264 GIC        2363 : CheckPointBuffers(int flags)
    3265                 : {
    3266            2363 :     BufferSync(flags);
    3267            2363 : }
    3268                 : 
    3269                 : /*
    3270                 :  * BufferGetBlockNumber
    3271                 :  *      Returns the block number associated with a buffer.
    3272                 :  *
    3273                 :  * Note:
    3274                 :  *      Assumes that the buffer is valid and pinned, else the
    3275                 :  *      value may be obsolete immediately...
    3276                 :  */
    3277                 : BlockNumber
    3278       104052285 : BufferGetBlockNumber(Buffer buffer)
    3279                 : {
    3280                 :     BufferDesc *bufHdr;
    3281                 : 
    3282       104052285 :     Assert(BufferIsPinned(buffer));
    3283                 : 
    3284       104052285 :     if (BufferIsLocal(buffer))
    3285 CBC     2225860 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    3286 ECB             :     else
    3287 GIC   101826425 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3288                 : 
    3289                 :     /* pinned, so OK to read tag without spinlock */
    3290       104052285 :     return bufHdr->tag.blockNum;
    3291                 : }
    3292                 : 
    3293                 : /*
    3294                 :  * BufferGetTag
    3295                 :  *      Returns the relfilelocator, fork number and block number associated with
    3296                 :  *      a buffer.
    3297                 :  */
    3298                 : void
    3299 GNC    23082246 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
    3300                 :              BlockNumber *blknum)
    3301                 : {
    3302                 :     BufferDesc *bufHdr;
    3303                 : 
    3304                 :     /* Do the same checks as BufferGetBlockNumber. */
    3305 GIC    23082246 :     Assert(BufferIsPinned(buffer));
    3306                 : 
    3307        23082246 :     if (BufferIsLocal(buffer))
    3308 UIC           0 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    3309                 :     else
    3310 CBC    23082246 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3311                 : 
    3312                 :     /* pinned, so OK to read tag without spinlock */
    3313 GNC    23082246 :     *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    3314        23082246 :     *forknum = BufTagGetForkNum(&bufHdr->tag);
    3315 GIC    23082246 :     *blknum = bufHdr->tag.blockNum;
    3316        23082246 : }
    3317                 : 
    3318                 : /*
    3319                 :  * FlushBuffer
    3320 ECB             :  *      Physically write out a shared buffer.
    3321                 :  *
    3322 EUB             :  * NOTE: this actually just passes the buffer contents to the kernel; the
    3323                 :  * real write to disk won't happen until the kernel feels like it.  This
    3324                 :  * is okay from our point of view since we can redo the changes from WAL.
    3325                 :  * However, we will need to force the changes to disk via fsync before
    3326                 :  * we can checkpoint WAL.
    3327                 :  *
    3328                 :  * The caller must hold a pin on the buffer and have share-locked the
    3329                 :  * buffer contents.  (Note: a share-lock does not prevent updates of
    3330                 :  * hint bits in the buffer, so the page could change while the write
    3331                 :  * is in progress, but we assume that that will not invalidate the data
    3332                 :  * written.)
    3333                 :  *
    3334 ECB             :  * If the caller has an smgr reference for the buffer's relation, pass it
    3335                 :  * as the second parameter.  If not, pass NULL.
    3336                 :  */
    3337                 : static void
    3338 GNC      725820 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
    3339                 :             IOContext io_context)
    3340 ECB             : {
    3341                 :     XLogRecPtr  recptr;
    3342                 :     ErrorContextCallback errcallback;
    3343                 :     instr_time  io_start;
    3344                 :     Block       bufBlock;
    3345                 :     char       *bufToWrite;
    3346                 :     uint32      buf_state;
    3347                 : 
    3348                 :     /*
    3349                 :      * Try to start an I/O operation.  If StartBufferIO returns false, then
    3350                 :      * someone else flushed the buffer before we could, so we need not do
    3351                 :      * anything.
    3352                 :      */
    3353 GIC      725820 :     if (!StartBufferIO(buf, false))
    3354 LBC           0 :         return;
    3355 ECB             : 
    3356                 :     /* Setup error traceback support for ereport() */
    3357 GIC      725820 :     errcallback.callback = shared_buffer_write_error_callback;
    3358 CBC      725820 :     errcallback.arg = (void *) buf;
    3359 GIC      725820 :     errcallback.previous = error_context_stack;
    3360          725820 :     error_context_stack = &errcallback;
    3361                 : 
    3362                 :     /* Find smgr relation for buffer */
    3363          725820 :     if (reln == NULL)
    3364 GNC      722320 :         reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId);
    3365                 : 
    3366                 :     TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
    3367                 :                                         buf->tag.blockNum,
    3368                 :                                         reln->smgr_rlocator.locator.spcOid,
    3369                 :                                         reln->smgr_rlocator.locator.dbOid,
    3370                 :                                         reln->smgr_rlocator.locator.relNumber);
    3371                 : 
    3372 GIC      725820 :     buf_state = LockBufHdr(buf);
    3373                 : 
    3374                 :     /*
    3375                 :      * Run PageGetLSN while holding header lock, since we don't have the
    3376                 :      * buffer locked exclusively in all cases.
    3377                 :      */
    3378 CBC      725820 :     recptr = BufferGetLSN(buf);
    3379 ECB             : 
    3380                 :     /* To check if block content changes while flushing. - vadim 01/17/97 */
    3381 GIC      725820 :     buf_state &= ~BM_JUST_DIRTIED;
    3382          725820 :     UnlockBufHdr(buf, buf_state);
    3383                 : 
    3384                 :     /*
    3385                 :      * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
    3386                 :      * rule that log updates must hit disk before any of the data-file changes
    3387                 :      * they describe do.
    3388                 :      *
    3389                 :      * However, this rule does not apply to unlogged relations, which will be
    3390                 :      * lost after a crash anyway.  Most unlogged relation pages do not bear
    3391                 :      * LSNs since we never emit WAL records for them, and therefore flushing
    3392                 :      * up through the buffer LSN would be useless, but harmless.  However,
    3393 ECB             :      * GiST indexes use LSNs internally to track page-splits, and therefore
    3394                 :      * unlogged GiST pages bear "fake" LSNs generated by
    3395                 :      * GetFakeLSNForUnloggedRel.  It is unlikely but possible that the fake
    3396                 :      * LSN counter could advance past the WAL insertion point; and if it did
    3397                 :      * happen, attempting to flush WAL through that location would fail, with
    3398                 :      * disastrous system-wide consequences.  To make sure that can't happen,
    3399                 :      * skip the flush if the buffer isn't permanent.
    3400                 :      */
    3401 CBC      725820 :     if (buf_state & BM_PERMANENT)
    3402          723799 :         XLogFlush(recptr);
    3403                 : 
    3404                 :     /*
    3405                 :      * Now it's safe to write the buffer to disk. Note that no one else should
    3406                 :      * have been able to write it while we were busy with log flushing because
    3407                 :      * only one process at a time can set the BM_IO_IN_PROGRESS bit.
    3408                 :      */
    3409 GIC      725820 :     bufBlock = BufHdrGetBlock(buf);
    3410 ECB             : 
    3411                 :     /*
    3412                 :      * Update page checksum if desired.  Since we have only a shared lock on the
    3413                 :      * buffer, other processes might be updating hint bits in it, so we must
    3414                 :      * copy the page to private storage if we do checksumming.
    3415                 :      */
    3416 GIC      725820 :     bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
    3417                 : 
    3418 GNC      725820 :     io_start = pgstat_prepare_io_time();
    3419                 : 
    3420                 :     /*
    3421 ECB             :      * bufToWrite is either the shared buffer or a copy, as appropriate.
    3422                 :      */
    3423 GIC      725820 :     smgrwrite(reln,
    3424 GNC      725820 :               BufTagGetForkNum(&buf->tag),
    3425                 :               buf->tag.blockNum,
    3426                 :               bufToWrite,
    3427                 :               false);
    3428                 : 
    3429                 :     /*
    3430                 :      * When a strategy is in use, only flushes of dirty buffers already in the
    3431                 :      * strategy ring are counted as strategy writes (IOCONTEXT
    3432                 :      * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
    3433                 :      * statistics tracking.
    3434                 :      *
    3435                 :      * If a shared buffer initially added to the ring must be flushed before
    3436                 :      * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
    3437                 :      *
    3438                 :      * If a shared buffer which was added to the ring later because the
    3439                 :      * current strategy buffer is pinned or in use or because all strategy
    3440                 :      * buffers were dirty and rejected (for BAS_BULKREAD operations only)
    3441                 :      * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
    3442                 :      * (from_ring will be false).
    3443                 :      *
    3444                 :      * When a strategy is not in use, the write can only be a "regular" write
    3445                 :      * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
    3446                 :      */
    3447          725820 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
    3448                 :                             IOOP_WRITE, io_start, 1);
    3449 ECB             : 
    3450 GIC      725820 :     pgBufferUsage.shared_blks_written++;
    3451                 : 
    3452                 :     /*
    3453                 :      * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
    3454                 :      * end the BM_IO_IN_PROGRESS state.
    3455                 :      */
    3456          725820 :     TerminateBufferIO(buf, true, 0);
    3457                 : 
    3458                 :     TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
    3459 ECB             :                                        buf->tag.blockNum,
    3460                 :                                        reln->smgr_rlocator.locator.spcOid,
    3461                 :                                        reln->smgr_rlocator.locator.dbOid,
    3462                 :                                        reln->smgr_rlocator.locator.relNumber);
    3463                 : 
    3464                 :     /* Pop the error context stack */
    3465 GIC      725820 :     error_context_stack = errcallback.previous;
    3466                 : }
    3467                 : 
    3468                 : /*
    3469                 :  * RelationGetNumberOfBlocksInFork
    3470                 :  *      Determines the current number of pages in the specified relation fork.
    3471                 :  *
    3472 ECB             :  * Note that the accuracy of the result will depend on the details of the
    3473                 :  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
    3474                 :  * it might not be.
    3475                 :  */
    3476                 : BlockNumber
    3477 GIC     2074269 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
    3478                 : {
    3479         2074269 :     if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
    3480 ECB             :     {
    3481                 :         /*
    3482                 :          * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
    3483                 :          * tableam returns the size in bytes - but for the purpose of this
    3484                 :          * routine, we want the number of blocks. Therefore divide, rounding
    3485                 :          * up.
    3486                 :          */
    3487                 :         uint64      szbytes;
    3488                 : 
    3489 GIC     1614501 :         szbytes = table_relation_size(relation, forkNum);
    3490                 : 
    3491 CBC     1614483 :         return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
    3492                 :     }
    3493          459768 :     else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
    3494 ECB             :     {
    3495 CBC      459768 :         return smgrnblocks(RelationGetSmgr(relation), forkNum);
    3496                 :     }
    3497                 :     else
    3498 LBC           0 :         Assert(false);
    3499                 : 
    3500 ECB             :     return 0;                   /* keep compiler quiet */
    3501                 : }
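As a quick check of the rounding above: with the standard 8192-byte BLCKSZ, a table AM reporting 8193 bytes yields (8193 + 8191) / 8192 = 2 blocks, while exactly 8192 bytes yields 1, so a partially filled trailing page is always counted as a whole block.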
    3502                 : 
    3503                 : /*
    3504                 :  * BufferIsPermanent
    3505                 :  *      Determines whether a buffer will potentially still be around after
    3506                 :  *      a crash.  Caller must hold a buffer pin.
    3507                 :  */
    3508                 : bool
    3509 GIC    15775629 : BufferIsPermanent(Buffer buffer)
    3510 ECB             : {
    3511                 :     BufferDesc *bufHdr;
    3512                 : 
    3513                 :     /* Local buffers are used only for temp relations. */
    3514 GIC    15775629 :     if (BufferIsLocal(buffer))
    3515 GBC      672453 :         return false;
    3516 EUB             : 
    3517                 :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
    3518 GIC    15103176 :     Assert(BufferIsValid(buffer));
    3519 CBC    15103176 :     Assert(BufferIsPinned(buffer));
    3520 ECB             : 
    3521                 :     /*
    3522                 :      * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
    3523                 :      * need not bother with the buffer header spinlock.  Even if someone else
    3524                 :      * changes the buffer header state while we're doing this, the state is
    3525                 :      * changed atomically, so we'll read the old value or the new value, but
    3526                 :      * not random garbage.
    3527                 :      */
    3528 GIC    15103176 :     bufHdr = GetBufferDescriptor(buffer - 1);
    3529        15103176 :     return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
    3530                 : }
    3531                 : 
    3532                 : /*
    3533                 :  * BufferGetLSNAtomic
    3534                 :  *      Retrieves the LSN of the buffer atomically using a buffer header lock.
    3535                 :  *      This is necessary for some callers who may not have an exclusive lock
    3536                 :  *      on the buffer.
    3537                 :  */
    3538                 : XLogRecPtr
    3539        10063084 : BufferGetLSNAtomic(Buffer buffer)
    3540                 : {
    3541        10063084 :     BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
    3542 CBC    10063084 :     char       *page = BufferGetPage(buffer);
    3543 ECB             :     XLogRecPtr  lsn;
    3544                 :     uint32      buf_state;
    3545                 : 
    3546                 :     /*
    3547                 :      * If we don't need locking for correctness, fastpath out.
    3548                 :      */
    3549 GIC    10063084 :     if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
    3550         8369376 :         return PageGetLSN(page);
    3551                 : 
    3552                 :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
    3553         1693708 :     Assert(BufferIsValid(buffer));
    3554         1693708 :     Assert(BufferIsPinned(buffer));
    3555                 : 
    3556         1693708 :     buf_state = LockBufHdr(bufHdr);
    3557         1693708 :     lsn = PageGetLSN(page);
    3558 CBC     1693708 :     UnlockBufHdr(bufHdr, buf_state);
    3559                 : 
    3560 GIC     1693708 :     return lsn;
    3561                 : }
    3562                 : 
    3563                 : /* ---------------------------------------------------------------------
    3564                 :  *      DropRelationBuffers
    3565                 :  *
    3566                 :  *      This function removes from the buffer pool all the pages of the
    3567                 :  *      specified relation forks that have block numbers >= firstDelBlock.
    3568                 :  *      (In particular, with firstDelBlock = 0, all pages are removed.)
    3569                 :  *      Dirty pages are simply dropped, without bothering to write them
    3570                 :  *      out first.  Therefore, this is NOT rollback-able, and so should be
    3571                 :  *      used only with extreme caution!
    3572                 :  *
    3573                 :  *      Currently, this is called only from smgr.c when the underlying file
    3574                 :  *      is about to be deleted or truncated (firstDelBlock is needed for
    3575                 :  *      the truncation case).  The data in the affected pages would therefore
    3576                 :  *      be deleted momentarily anyway, and there is no point in writing it.
    3577                 :  *      It is the responsibility of higher-level code to ensure that the
    3578 ECB             :  *      deletion or truncation does not lose any data that could be needed
    3579                 :  *      later.  It is also the responsibility of higher-level code to ensure
    3580                 :  *      that no other process could be trying to load more pages of the
    3581                 :  *      relation into buffers.
    3582                 :  * --------------------------------------------------------------------
    3583                 :  */
    3584                 : void
    3585 GNC         494 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
    3586                 :                     int nforks, BlockNumber *firstDelBlock)
    3587                 : {
    3588                 :     int         i;
    3589                 :     int         j;
    3590                 :     RelFileLocatorBackend rlocator;
    3591                 :     BlockNumber nForkBlock[MAX_FORKNUM];
    3592 GIC         494 :     uint64      nBlocksToInvalidate = 0;
    3593                 : 
    3594 GNC         494 :     rlocator = smgr_reln->smgr_rlocator;
    3595                 : 
    3596 ECB             :     /* If it's a local relation, it's localbuf.c's problem. */
    3597 GNC         494 :     if (RelFileLocatorBackendIsTemp(rlocator))
    3598 ECB             :     {
    3599 GNC         329 :         if (rlocator.backend == MyBackendId)
    3600                 :         {
    3601 CBC         675 :             for (j = 0; j < nforks; j++)
    3602 GNC         346 :                 DropRelationLocalBuffers(rlocator.locator, forkNum[j],
    3603             346 :                                          firstDelBlock[j]);
    3604                 :         }
    3605 GIC         363 :         return;
    3606 ECB             :     }
    3607                 : 
    3608                 :     /*
    3609                 :      * To remove all the pages of the specified relation forks from the buffer
    3610                 :      * pool, we need to scan the entire buffer pool but we can optimize it by
    3611                 :      * finding the buffers from BufMapping table provided we know the exact
    3612                 :      * size of each fork of the relation. The exact size is required to ensure
    3613                 :      * that we don't leave any buffer for the relation being dropped as
    3614                 :      * otherwise the background writer or checkpointer can lead to a PANIC
    3615                 :      * error while flushing buffers corresponding to files that don't exist.
    3616                 :      *
    3617                 :      * To know the exact size, we rely on the size cached for each fork by us
    3618                 :      * during recovery which limits the optimization to recovery and on
    3619                 :      * standbys but we can easily extend it once we have shared cache for
    3620                 :      * relation size.
    3621                 :      *
    3622                 :      * In recovery, we cache the value returned by the first lseek(SEEK_END)
    3623                 :      * and the future writes keeps the cached value up-to-date. See
    3624                 :      * smgrextend. It is possible that the value of the first lseek is smaller
    3625                 :      * than the actual number of existing blocks in the file due to buggy
    3626                 :      * Linux kernels that might not have accounted for the recent write. But
    3627                 :      * that should be fine because there must not be any buffers after that
    3628                 :      * file size.
    3629                 :      */
    3630 CBC         234 :     for (i = 0; i < nforks; i++)
    3631                 :     {
    3632 ECB             :         /* Get the number of blocks for a relation's fork */
    3633 GIC         195 :         nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
    3634 ECB             : 
    3635 GIC         195 :         if (nForkBlock[i] == InvalidBlockNumber)
    3636                 :         {
    3637             126 :             nBlocksToInvalidate = InvalidBlockNumber;
    3638             126 :             break;
    3639                 :         }
    3640                 : 
    3641                 :         /* calculate the number of blocks to be invalidated */
    3642              69 :         nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
    3643                 :     }
    3644                 : 
    3645 ECB             :     /*
    3646                 :      * We apply the optimization iff the total number of blocks to invalidate
    3647                 :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    3648                 :      */
    3649 CBC         165 :     if (BlockNumberIsValid(nBlocksToInvalidate) &&
    3650 GIC          39 :         nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    3651 ECB             :     {
    3652 CBC          91 :         for (j = 0; j < nforks; j++)
    3653 GNC          57 :             FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
    3654              57 :                                        nForkBlock[j], firstDelBlock[j]);
    3655 GIC          34 :         return;
    3656                 :     }
    3657                 : 
    3658         1658755 :     for (i = 0; i < NBuffers; i++)
    3659                 :     {
    3660         1658624 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    3661                 :         uint32      buf_state;
    3662 ECB             : 
    3663                 :         /*
    3664                 :          * We can make this a tad faster by prechecking the buffer tag before
    3665                 :          * we attempt to lock the buffer; this saves a lot of lock
    3666                 :          * acquisitions in typical cases.  It should be safe because the
    3667                 :          * caller must have AccessExclusiveLock on the relation, or some other
    3668                 :          * reason to be certain that no one is loading new pages of the rel
    3669                 :          * into the buffer pool.  (Otherwise we might well miss such pages
    3670                 :          * entirely.)  Therefore, while the tag might be changing while we
    3671                 :          * look at it, it can't be changing *to* a value we care about, only
    3672                 :          * *away* from such a value.  So false negatives are impossible, and
    3673                 :          * false positives are safe because we'll recheck after getting the
    3674                 :          * buffer lock.
    3675                 :          *
    3676                 :          * We could check forkNum and blockNum as well as the rlocator, but
    3677                 :          * the incremental win from doing so seems small.
    3678                 :          */
    3679 GNC     1658624 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
    3680 CBC     1657013 :             continue;
    3681                 : 
    3682 GIC        1611 :         buf_state = LockBufHdr(bufHdr);
    3683                 : 
    3684            3479 :         for (j = 0; j < nforks; j++)
    3685                 :         {
    3686 GNC        2648 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
    3687            2648 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
    3688 GIC        1562 :                 bufHdr->tag.blockNum >= firstDelBlock[j])
    3689 ECB             :             {
    3690 GIC         780 :                 InvalidateBuffer(bufHdr);   /* releases spinlock */
    3691 CBC         780 :                 break;
    3692                 :             }
    3693                 :         }
    3694            1611 :         if (j >= nforks)
    3695             831 :             UnlockBufHdr(bufHdr, buf_state);
    3696                 :     }
    3697                 : }
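
The loop above is the standard form of the unlocked-precheck pattern its comment describes: test the buffer tag without the header spinlock, then lock and recheck before acting. A minimal sketch of just that pattern, with "target" standing in for a hypothetical RelFileLocator and the per-fork details dropped:

    for (int i = 0; i < NBuffers; i++)
    {
        BufferDesc *hdr = GetBufferDescriptor(i);
        uint32      state;

        /* Unlocked precheck: the tag can only change *away* from target, so
         * a false negative is impossible; a false positive is caught by the
         * locked recheck below. */
        if (!BufTagMatchesRelFileLocator(&hdr->tag, &target))
            continue;

        state = LockBufHdr(hdr);
        if (BufTagMatchesRelFileLocator(&hdr->tag, &target))
            InvalidateBuffer(hdr);      /* releases the spinlock */
        else
            UnlockBufHdr(hdr, state);
    }
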
    3698                 : 
    3699                 : /* ---------------------------------------------------------------------
    3700                 :  *      DropRelationsAllBuffers
    3701                 :  *
    3702                 :  *      This function removes from the buffer pool all the pages of all
    3703                 :  *      forks of the specified relations.  It's equivalent to calling
    3704                 :  *      DropRelationBuffers once per fork per relation with firstDelBlock = 0.
    3705                 :  *      --------------------------------------------------------------------
    3706                 :  */
    3707 ECB             : void
    3708 GNC       10663 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
    3709                 : {
    3710                 :     int         i;
    3711 CBC       10663 :     int         n = 0;
    3712                 :     SMgrRelation *rels;
    3713 ECB             :     BlockNumber (*block)[MAX_FORKNUM + 1];
    3714 GIC       10663 :     uint64      nBlocksToInvalidate = 0;
    3715                 :     RelFileLocator *locators;
    3716           10663 :     bool        cached = true;
    3717 EUB             :     bool        use_bsearch;
    3718                 : 
    3719 GNC       10663 :     if (nlocators == 0)
    3720 UIC           0 :         return;
    3721                 : 
    3722 GNC       10663 :     rels = palloc(sizeof(SMgrRelation) * nlocators);    /* non-local relations */
    3723 ECB             : 
    3724                 :     /* If it's a local relation, it's localbuf.c's problem. */
    3725 GNC       48045 :     for (i = 0; i < nlocators; i++)
    3726                 :     {
    3727           37382 :         if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
    3728 EUB             :         {
    3729 GNC        2808 :             if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
    3730            2808 :                 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
    3731 EUB             :         }
    3732                 :         else
    3733 GIC       34574 :             rels[n++] = smgr_reln[i];
    3734                 :     }
    3735 ECB             : 
    3736                 :     /*
    3737                 :      * If there are no non-local relations, then we're done. Release the
    3738                 :      * memory and return.
    3739                 :      */
    3740 GIC       10663 :     if (n == 0)
    3741                 :     {
    3742             700 :         pfree(rels);
    3743 GBC         700 :         return;
    3744                 :     }
    3745                 : 
    3746                 :     /*
     3747                 :      * This is used to remember the number of blocks for all the forks of the
     3748                 :      * given relations.
    3749                 :      */
    3750                 :     block = (BlockNumber (*)[MAX_FORKNUM + 1])
    3751            9963 :         palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
    3752 EUB             : 
    3753                 :     /*
    3754                 :      * We can avoid scanning the entire buffer pool if we know the exact size
    3755                 :      * of each of the given relation forks. See DropRelationBuffers.
    3756                 :      */
    3757 GIC       20986 :     for (i = 0; i < n && cached; i++)
    3758                 :     {
    3759 GNC       18021 :         for (int j = 0; j <= MAX_FORKNUM; j++)
    3760 EUB             :         {
    3761                 :             /* Get the number of blocks for a relation's fork. */
    3762 GBC       16281 :             block[i][j] = smgrnblocks_cached(rels[i], j);
    3763                 : 
     3764                 :             /* We need to consider only the relation forks that exist. */
    3765 GIC       16281 :             if (block[i][j] == InvalidBlockNumber)
    3766 EUB             :             {
    3767 GIC       14398 :                 if (!smgrexists(rels[i], j))
    3768 GBC        5115 :                     continue;
    3769            9283 :                 cached = false;
    3770 GIC        9283 :                 break;
    3771                 :             }
    3772                 : 
    3773                 :             /* calculate the total number of blocks to be invalidated */
    3774            1883 :             nBlocksToInvalidate += block[i][j];
    3775 EUB             :         }
    3776                 :     }
    3777                 : 
    3778                 :     /*
    3779                 :      * We apply the optimization iff the total number of blocks to invalidate
    3780                 :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    3781                 :      */
    3782 GIC        9963 :     if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    3783                 :     {
    3784            1107 :         for (i = 0; i < n; i++)
    3785                 :         {
    3786 GNC        3050 :             for (int j = 0; j <= MAX_FORKNUM; j++)
    3787 ECB             :             {
     3788                 :                 /* ignore relation forks that don't exist */
    3789 CBC        2440 :                 if (!BlockNumberIsValid(block[i][j]))
    3790            1829 :                     continue;
    3791                 : 
    3792                 :                 /* drop all the buffers for a particular relation fork */
    3793 GNC         611 :                 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
    3794             611 :                                            j, block[i][j], 0);
    3795                 :             }
    3796                 :         }
    3797                 : 
    3798 GIC         497 :         pfree(block);
    3799             497 :         pfree(rels);
    3800             497 :         return;
    3801 ECB             :     }
    3802                 : 
    3803 GIC        9466 :     pfree(block);
    3804 GNC        9466 :     locators = palloc(sizeof(RelFileLocator) * n);  /* non-local relations */
    3805 CBC       43430 :     for (i = 0; i < n; i++)
    3806 GNC       33964 :         locators[i] = rels[i]->smgr_rlocator.locator;
    3807 ECB             : 
    3808                 :     /*
     3809                 :      * For a small number of relations to drop, just use a simple walk-through
     3810                 :      * to save the bsearch overhead.  The threshold is a guess rather than an
     3811                 :      * exactly determined value, as it depends on many factors (CPU and RAM
     3812                 :      * speeds, amount of shared buffers, etc.).
    3813                 :      */
    3814 GIC        9466 :     use_bsearch = n > RELS_BSEARCH_THRESHOLD;
    3815                 : 
    3816                 :     /* sort the list of rlocators if necessary */
    3817            9466 :     if (use_bsearch)
    3818 GNC         165 :         pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
    3819                 : 
    3820 GIC   102040826 :     for (i = 0; i < NBuffers; i++)
    3821                 :     {
    3822 GNC   102031360 :         RelFileLocator *rlocator = NULL;
    3823 GIC   102031360 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    3824                 :         uint32      buf_state;
    3825                 : 
    3826                 :         /*
    3827                 :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    3828                 :          * saves some cycles.
    3829                 :          */
    3830 ECB             : 
    3831 GBC   102031360 :         if (!use_bsearch)
    3832                 :         {
    3833 ECB             :             int         j;
    3834                 : 
    3835 GIC   413532505 :             for (j = 0; j < n; j++)
    3836 ECB             :             {
    3837 GNC   313271388 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
    3838 ECB             :                 {
    3839 GNC       74755 :                     rlocator = &locators[j];
    3840 GIC       74755 :                     break;
    3841                 :                 }
    3842                 :             }
    3843                 :         }
    3844                 :         else
    3845                 :         {
    3846                 :             RelFileLocator locator;
    3847                 : 
    3848 GNC     1695488 :             locator = BufTagGetRelFileLocator(&bufHdr->tag);
    3849         1695488 :             rlocator = bsearch((const void *) &(locator),
    3850                 :                                locators, n, sizeof(RelFileLocator),
    3851                 :                                rlocator_comparator);
    3852                 :         }
    3853                 : 
    3854                 :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
    3855       102031360 :         if (rlocator == NULL)
    3856 GIC   101954940 :             continue;
    3857                 : 
    3858           76420 :         buf_state = LockBufHdr(bufHdr);
    3859 GNC       76420 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
    3860 GIC       76420 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    3861                 :         else
    3862 UIC           0 :             UnlockBufHdr(bufHdr, buf_state);
    3863                 :     }
    3864 ECB             : 
    3865 GNC        9466 :     pfree(locators);
    3866 GIC        9466 :     pfree(rels);
    3867                 : }
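
A hypothetical caller sketch (not an excerpt from the backend): drop every buffered page of two relations whose smgr handles we obtain here; locator_a and locator_b are placeholder RelFileLocators.

    SMgrRelation drop_rels[2];

    drop_rels[0] = smgropen(locator_a, InvalidBackendId);
    drop_rels[1] = smgropen(locator_b, InvalidBackendId);

    /* Temporary (local) relations are routed to localbuf.c internally. */
    DropRelationsAllBuffers(drop_rels, 2);
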
    3868                 : 
    3869                 : /* ---------------------------------------------------------------------
    3870                 :  *      FindAndDropRelationBuffers
    3871                 :  *
     3872                 :  *      This function performs a lookup in the BufMapping table and removes
     3873                 :  *      from the buffer pool all pages of the specified relation fork that
     3874                 :  *      have block numbers >= firstDelBlock.  (In particular, with
     3875                 :  *      firstDelBlock = 0, all pages are removed.)
    3876                 :  * --------------------------------------------------------------------
    3877                 :  */
    3878                 : static void
    3879 GNC         668 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
    3880                 :                            BlockNumber nForkBlock,
    3881                 :                            BlockNumber firstDelBlock)
    3882                 : {
    3883 ECB             :     BlockNumber curBlock;
    3884                 : 
    3885 CBC        1612 :     for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
    3886 ECB             :     {
    3887                 :         uint32      bufHash;    /* hash value for tag */
    3888                 :         BufferTag   bufTag;     /* identity of requested block */
    3889                 :         LWLock     *bufPartitionLock;   /* buffer partition lock for it */
    3890                 :         int         buf_id;
    3891                 :         BufferDesc *bufHdr;
    3892                 :         uint32      buf_state;
    3893                 : 
     3894                 :         /* create a tag so we can look up the buffer */
    3895 GNC         944 :         InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
    3896                 : 
    3897                 :         /* determine its hash code and partition lock ID */
    3898 CBC         944 :         bufHash = BufTableHashCode(&bufTag);
    3899 GIC         944 :         bufPartitionLock = BufMappingPartitionLock(bufHash);
    3900                 : 
    3901                 :         /* Check that it is in the buffer pool. If not, do nothing. */
    3902             944 :         LWLockAcquire(bufPartitionLock, LW_SHARED);
    3903             944 :         buf_id = BufTableLookup(&bufTag, bufHash);
    3904 CBC         944 :         LWLockRelease(bufPartitionLock);
    3905                 : 
    3906 GIC         944 :         if (buf_id < 0)
    3907 CBC         120 :             continue;
    3908 ECB             : 
    3909 GIC         824 :         bufHdr = GetBufferDescriptor(buf_id);
    3910                 : 
    3911                 :         /*
    3912                 :          * We need to lock the buffer header and recheck if the buffer is
    3913                 :          * still associated with the same block because the buffer could be
    3914                 :          * evicted by some other backend loading blocks for a different
     3915                 :          * relation after we release the lock on the BufMapping table.
    3916                 :          */
    3917             824 :         buf_state = LockBufHdr(bufHdr);
    3918                 : 
    3919 GNC        1648 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
    3920             824 :             BufTagGetForkNum(&bufHdr->tag) == forkNum &&
    3921 GIC         824 :             bufHdr->tag.blockNum >= firstDelBlock)
    3922             824 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    3923                 :         else
    3924 UIC           0 :             UnlockBufHdr(bufHdr, buf_state);
    3925                 :     }
    3926 GIC         668 : }
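
To make the BUF_DROP_FULL_SCAN_THRESHOLD trade-off concrete, an illustrative (made-up) example: truncating a fork from 1,000 blocks down to 990 leaves nBlocksToInvalidate = 10, so ten targeted BufTableLookup probes replace a pass over every buffer header, of which there are 16,384 at the default 128MB of shared_buffers. The two callers above make roughly this decision (locator, nForkBlock and firstDelBlock are placeholders):

    if (BlockNumberIsValid(nBlocksToInvalidate) &&
        nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
        FindAndDropRelationBuffers(locator, MAIN_FORKNUM,
                                   nForkBlock, firstDelBlock);
    /* otherwise the callers fall back to scanning all NBuffers headers */
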
    3927 ECB             : 
    3928                 : /* ---------------------------------------------------------------------
    3929                 :  *      DropDatabaseBuffers
    3930                 :  *
    3931                 :  *      This function removes all the buffers in the buffer cache for a
    3932                 :  *      particular database.  Dirty pages are simply dropped, without
    3933                 :  *      bothering to write them out first.  This is used when we destroy a
    3934                 :  *      database, to avoid trying to flush data to disk when the directory
    3935                 :  *      tree no longer exists.  Implementation is pretty similar to
    3936                 :  *      DropRelationBuffers() which is for destroying just one relation.
    3937                 :  * --------------------------------------------------------------------
    3938                 :  */
    3939                 : void
    3940 GIC          34 : DropDatabaseBuffers(Oid dbid)
    3941                 : {
    3942 ECB             :     int         i;
    3943                 : 
    3944                 :     /*
    3945                 :      * We needn't consider local buffers, since by assumption the target
    3946                 :      * database isn't our own.
    3947                 :      */
    3948                 : 
    3949 CBC      118178 :     for (i = 0; i < NBuffers; i++)
    3950 ECB             :     {
    3951 GIC      118144 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    3952                 :         uint32      buf_state;
    3953                 : 
    3954                 :         /*
    3955                 :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    3956                 :          * saves some cycles.
    3957                 :          */
    3958 GNC      118144 :         if (bufHdr->tag.dbOid != dbid)
    3959 GIC      112397 :             continue;
    3960                 : 
    3961            5747 :         buf_state = LockBufHdr(bufHdr);
    3962 GNC        5747 :         if (bufHdr->tag.dbOid == dbid)
    3963 GIC        5747 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    3964                 :         else
    3965 UIC           0 :             UnlockBufHdr(bufHdr, buf_state);
    3966                 :     }
    3967 GIC          34 : }
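
A hypothetical caller sketch (db_oid is a placeholder; this is not taken from the CREATE/DROP DATABASE code): discard the doomed database's buffers before its directory is removed, so nothing later tries to write into a path that no longer exists.

    /* db_oid identifies the database being destroyed (never our own). */
    DropDatabaseBuffers(db_oid);
    /* ... then remove the database's directory on disk ... */
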
    3968                 : 
    3969                 : /* -----------------------------------------------------------------
    3970                 :  *      PrintBufferDescs
    3971                 :  *
    3972                 :  *      this function prints all the buffer descriptors, for debugging
    3973 ECB             :  *      use only.
    3974                 :  * -----------------------------------------------------------------
    3975                 :  */
    3976                 : #ifdef NOT_USED
    3977                 : void
    3978                 : PrintBufferDescs(void)
    3979                 : {
    3980                 :     int         i;
    3981                 : 
    3982                 :     for (i = 0; i < NBuffers; ++i)
    3983                 :     {
    3984                 :         BufferDesc *buf = GetBufferDescriptor(i);
    3985                 :         Buffer      b = BufferDescriptorGetBuffer(buf);
    3986                 : 
    3987                 :         /* theoretically we should lock the bufhdr here */
    3988                 :         elog(LOG,
    3989                 :              "[%02d] (freeNext=%d, rel=%s, "
    3990                 :              "blockNum=%u, flags=0x%x, refcount=%u %d)",
    3991                 :              i, buf->freeNext,
    3992                 :              relpathbackend(BufTagGetRelFileLocator(&buf->tag),
    3993                 :                             InvalidBackendId, BufTagGetForkNum(&buf->tag)),
    3994                 :              buf->tag.blockNum, buf->flags,
    3995                 :              buf->refcount, GetPrivateRefCount(b));
    3996                 :     }
    3997                 : }
    3998                 : #endif
    3999                 : 
    4000                 : #ifdef NOT_USED
    4001                 : void
    4002                 : PrintPinnedBufs(void)
    4003                 : {
    4004                 :     int         i;
    4005                 : 
    4006                 :     for (i = 0; i < NBuffers; ++i)
    4007                 :     {
    4008                 :         BufferDesc *buf = GetBufferDescriptor(i);
    4009                 :         Buffer      b = BufferDescriptorGetBuffer(buf);
    4010                 : 
    4011                 :         if (GetPrivateRefCount(b) > 0)
    4012                 :         {
    4013                 :             /* theoretically we should lock the bufhdr here */
    4014                 :             elog(LOG,
    4015                 :                  "[%02d] (freeNext=%d, rel=%s, "
    4016                 :                  "blockNum=%u, flags=0x%x, refcount=%u %d)",
    4017                 :                  i, buf->freeNext,
    4018                 :                  relpathperm(BufTagGetRelFileLocator(&buf->tag),
    4019                 :                              BufTagGetForkNum(&buf->tag)),
    4020                 :                  buf->tag.blockNum, buf->flags,
    4021                 :                  buf->refcount, GetPrivateRefCount(b));
    4022                 :         }
    4023                 :     }
    4024                 : }
    4025                 : #endif
    4026 EUB             : 
    4027                 : /* ---------------------------------------------------------------------
    4028                 :  *      FlushRelationBuffers
    4029                 :  *
    4030                 :  *      This function writes all dirty pages of a relation out to disk
    4031                 :  *      (or more accurately, out to kernel disk buffers), ensuring that the
    4032                 :  *      kernel has an up-to-date view of the relation.
    4033                 :  *
    4034                 :  *      Generally, the caller should be holding AccessExclusiveLock on the
    4035                 :  *      target relation to ensure that no other backend is busy dirtying
    4036                 :  *      more blocks of the relation; the effects can't be expected to last
    4037 ECB             :  *      after the lock is released.
    4038                 :  *
    4039                 :  *      XXX currently it sequentially searches the buffer pool, should be
    4040                 :  *      changed to more clever ways of searching.  This routine is not
    4041                 :  *      used in any performance-critical code paths, so it's not worth
    4042                 :  *      adding additional overhead to normal paths to make it go faster.
    4043                 :  * --------------------------------------------------------------------
    4044                 :  */
    4045                 : void
    4046 CBC         115 : FlushRelationBuffers(Relation rel)
    4047 ECB             : {
    4048                 :     int         i;
    4049                 :     BufferDesc *bufHdr;
    4050                 : 
    4051 GIC         115 :     if (RelationUsesLocalBuffers(rel))
    4052                 :     {
    4053             909 :         for (i = 0; i < NLocBuffer; i++)
    4054                 :         {
    4055                 :             uint32      buf_state;
    4056                 :             instr_time  io_start;
    4057 ECB             : 
    4058 CBC         900 :             bufHdr = GetLocalBufferDescriptor(i);
    4059 GNC         900 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    4060 GIC         300 :                 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
    4061                 :                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    4062                 :             {
    4063                 :                 ErrorContextCallback errcallback;
    4064                 :                 Page        localpage;
    4065                 : 
    4066             297 :                 localpage = (char *) LocalBufHdrGetBlock(bufHdr);
    4067                 : 
    4068 ECB             :                 /* Setup error traceback support for ereport() */
    4069 GIC         297 :                 errcallback.callback = local_buffer_write_error_callback;
    4070 CBC         297 :                 errcallback.arg = (void *) bufHdr;
    4071             297 :                 errcallback.previous = error_context_stack;
    4072 GIC         297 :                 error_context_stack = &errcallback;
    4073                 : 
    4074             297 :                 PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
    4075                 : 
    4076 GNC         297 :                 io_start = pgstat_prepare_io_time();
    4077                 : 
    4078 GIC         297 :                 smgrwrite(RelationGetSmgr(rel),
    4079 GNC         297 :                           BufTagGetForkNum(&bufHdr->tag),
    4080 ECB             :                           bufHdr->tag.blockNum,
    4081                 :                           localpage,
    4082                 :                           false);
    4083                 : 
    4084 GNC         297 :                 pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
    4085                 :                                         IOCONTEXT_NORMAL, IOOP_WRITE,
    4086                 :                                         io_start, 1);
    4087                 : 
    4088 CBC         297 :                 buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
    4089             297 :                 pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
    4090                 : 
    4091 GNC         297 :                 pgBufferUsage.local_blks_written++;
    4092                 : 
    4093 ECB             :                 /* Pop the error context stack */
    4094 CBC         297 :                 error_context_stack = errcallback.previous;
    4095 ECB             :             }
    4096                 :         }
    4097                 : 
    4098 GIC           9 :         return;
    4099                 :     }
    4100                 : 
    4101                 :     /* Make sure we can handle the pin inside the loop */
    4102             106 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    4103                 : 
    4104         1249130 :     for (i = 0; i < NBuffers; i++)
    4105                 :     {
    4106                 :         uint32      buf_state;
    4107                 : 
    4108         1249024 :         bufHdr = GetBufferDescriptor(i);
    4109                 : 
    4110                 :         /*
    4111                 :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    4112                 :          * saves some cycles.
    4113                 :          */
    4114 GNC     1249024 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
    4115 GIC     1248867 :             continue;
    4116                 : 
    4117             157 :         ReservePrivateRefCountEntry();
    4118                 : 
    4119             157 :         buf_state = LockBufHdr(bufHdr);
    4120 GNC         157 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    4121 GIC         157 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    4122 ECB             :         {
    4123 GIC         133 :             PinBuffer_Locked(bufHdr);
    4124             133 :             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
    4125 GNC         133 :             FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4126 GIC         133 :             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    4127 GNC         133 :             UnpinBuffer(bufHdr);
    4128                 :         }
    4129 ECB             :         else
    4130 GIC          24 :             UnlockBufHdr(bufHdr, buf_state);
    4131 ECB             :     }
    4132                 : }
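
A hypothetical usage sketch (rel is a placeholder Relation, assumed to be held under AccessExclusiveLock as the header comment requires): push the relation's dirty pages out to the kernel before an operation that will read the underlying files directly.

    FlushRelationBuffers(rel);
    /* The kernel now has every change the buffer pool held; a separate
     * sync of the relation's files is still needed for durability. */
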
    4133                 : 
    4134                 : /* ---------------------------------------------------------------------
    4135                 :  *      FlushRelationsAllBuffers
    4136                 :  *
    4137                 :  *      This function flushes out of the buffer pool all the pages of all
    4138                 :  *      forks of the specified smgr relations.  It's equivalent to calling
    4139                 :  *      FlushRelationBuffers once per relation.  The relations are assumed not
    4140                 :  *      to use local buffers.
    4141                 :  * --------------------------------------------------------------------
    4142                 :  */
    4143                 : void
    4144 GIC           9 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
    4145                 : {
    4146                 :     int         i;
    4147                 :     SMgrSortArray *srels;
    4148                 :     bool        use_bsearch;
    4149                 : 
    4150               9 :     if (nrels == 0)
    4151 UIC           0 :         return;
    4152                 : 
    4153                 :     /* fill-in array for qsort */
    4154 GIC           9 :     srels = palloc(sizeof(SMgrSortArray) * nrels);
    4155                 : 
    4156              18 :     for (i = 0; i < nrels; i++)
    4157                 :     {
    4158 GNC           9 :         Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
    4159                 : 
    4160               9 :         srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
    4161 GIC           9 :         srels[i].srel = smgrs[i];
    4162                 :     }
    4163                 : 
    4164                 :     /*
    4165                 :      * Save the bsearch overhead for low number of relations to sync. See
    4166                 :      * DropRelationsAllBuffers for details.
    4167 ECB             :      */
    4168 GIC           9 :     use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
    4169                 : 
    4170 ECB             :     /* sort the list of SMgrRelations if necessary */
    4171 GIC           9 :     if (use_bsearch)
    4172 UNC           0 :         pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
    4173                 : 
    4174 ECB             :     /* Make sure we can handle the pin inside the loop */
    4175 CBC           9 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    4176                 : 
    4177 GIC      147465 :     for (i = 0; i < NBuffers; i++)
    4178                 :     {
    4179 CBC      147456 :         SMgrSortArray *srelent = NULL;
    4180 GIC      147456 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    4181                 :         uint32      buf_state;
    4182                 : 
    4183                 :         /*
    4184                 :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    4185                 :          * saves some cycles.
    4186 ECB             :          */
    4187                 : 
    4188 GIC      147456 :         if (!use_bsearch)
    4189 ECB             :         {
    4190                 :             int         j;
    4191                 : 
    4192 CBC      291200 :             for (j = 0; j < nrels; j++)
    4193                 :             {
    4194 GNC      147456 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
    4195 ECB             :                 {
    4196 GIC        3712 :                     srelent = &srels[j];
    4197 CBC        3712 :                     break;
    4198                 :                 }
    4199                 :             }
    4200                 :         }
    4201                 :         else
    4202                 :         {
    4203                 :             RelFileLocator rlocator;
    4204                 : 
    4205 UNC           0 :             rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    4206               0 :             srelent = bsearch((const void *) &(rlocator),
    4207                 :                               srels, nrels, sizeof(SMgrSortArray),
    4208                 :                               rlocator_comparator);
    4209                 :         }
    4210                 : 
    4211                 :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
    4212 GIC      147456 :         if (srelent == NULL)
    4213          143744 :             continue;
    4214                 : 
    4215            3712 :         ReservePrivateRefCountEntry();
    4216                 : 
    4217            3712 :         buf_state = LockBufHdr(bufHdr);
    4218 GNC        3712 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
    4219 CBC        3712 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    4220 ECB             :         {
    4221 GIC        3367 :             PinBuffer_Locked(bufHdr);
    4222 CBC        3367 :             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
    4223 GNC        3367 :             FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4224 CBC        3367 :             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    4225 GNC        3367 :             UnpinBuffer(bufHdr);
    4226 ECB             :         }
    4227                 :         else
    4228 CBC         345 :             UnlockBufHdr(bufHdr, buf_state);
    4229                 :     }
    4230 ECB             : 
    4231 CBC           9 :     pfree(srels);
    4232                 : }
    4233                 : 
    4234 ECB             : /* ---------------------------------------------------------------------
    4235                 :  *      RelationCopyStorageUsingBuffer
    4236                 :  *
     4237                 :  *      Copy a fork's data using bufmgr.  Same as RelationCopyStorage, but
     4238                 :  *      instead of using smgrread and smgrextend this copies using bufmgr APIs.
    4239                 :  *
     4240                 :  *      Refer to the comments atop CreateAndCopyRelationData() for details
     4241                 :  *      about the 'permanent' parameter.
    4242                 :  * --------------------------------------------------------------------
    4243                 :  */
    4244                 : static void
    4245 GNC       54479 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
    4246                 :                                RelFileLocator dstlocator,
    4247                 :                                ForkNumber forkNum, bool permanent)
    4248 ECB             : {
    4249                 :     Buffer      srcBuf;
    4250                 :     Buffer      dstBuf;
    4251                 :     Page        srcPage;
    4252                 :     Page        dstPage;
    4253                 :     bool        use_wal;
    4254                 :     BlockNumber nblocks;
    4255                 :     BlockNumber blkno;
    4256                 :     PGIOAlignedBlock buf;
    4257                 :     BufferAccessStrategy bstrategy_src;
    4258                 :     BufferAccessStrategy bstrategy_dst;
    4259                 : 
    4260 EUB             :     /*
    4261                 :      * In general, we want to write WAL whenever wal_level > 'minimal', but we
    4262 ECB             :      * can skip it when copying any fork of an unlogged relation other than
    4263                 :      * the init fork.
    4264                 :      */
    4265 CBC       54479 :     use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
    4266                 : 
    4267 ECB             :     /* Get number of blocks in the source relation. */
    4268 GNC       54479 :     nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId),
    4269 ECB             :                           forkNum);
    4270                 : 
    4271                 :     /* Nothing to copy; just return. */
    4272 GIC       54479 :     if (nblocks == 0)
    4273 CBC        9382 :         return;
    4274                 : 
    4275                 :     /*
     4276                 :      * Bulk-extend the destination relation to the same size as the source
     4277                 :      * relation before starting to copy block by block.
    4278                 :      */
    4279 GIC       45097 :     memset(buf.data, 0, BLCKSZ);
    4280 GNC       45097 :     smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1,
    4281                 :                buf.data, true);
    4282 ECB             : 
    4283                 :     /* This is a bulk operation, so use buffer access strategies. */
    4284 GIC       45097 :     bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
    4285           45097 :     bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
    4286                 : 
    4287                 :     /* Iterate over each block of the source relation file. */
    4288          217434 :     for (blkno = 0; blkno < nblocks; blkno++)
    4289                 :     {
    4290          172337 :         CHECK_FOR_INTERRUPTS();
    4291 ECB             : 
    4292                 :         /* Read block from source relation. */
    4293 GNC      172337 :         srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
    4294                 :                                            RBM_NORMAL, bstrategy_src,
    4295                 :                                            permanent);
    4296 GIC      172337 :         LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
    4297 CBC      172337 :         srcPage = BufferGetPage(srcBuf);
    4298                 : 
    4299 GNC      172337 :         dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
    4300                 :                                            RBM_ZERO_AND_LOCK, bstrategy_dst,
    4301                 :                                            permanent);
    4302 CBC      172337 :         dstPage = BufferGetPage(dstBuf);
    4303                 : 
    4304 GIC      172337 :         START_CRIT_SECTION();
    4305 ECB             : 
    4306                 :         /* Copy page data from the source to the destination. */
    4307 CBC      172337 :         memcpy(dstPage, srcPage, BLCKSZ);
    4308          172337 :         MarkBufferDirty(dstBuf);
    4309 ECB             : 
    4310                 :         /* WAL-log the copied page. */
    4311 GIC      172337 :         if (use_wal)
    4312          103988 :             log_newpage_buffer(dstBuf, true);
    4313                 : 
    4314 CBC      172337 :         END_CRIT_SECTION();
    4315                 : 
    4316 GIC      172337 :         UnlockReleaseBuffer(dstBuf);
    4317          172337 :         UnlockReleaseBuffer(srcBuf);
    4318                 :     }
    4319                 : 
    4320           45097 :     FreeAccessStrategy(bstrategy_src);
    4321           45097 :     FreeAccessStrategy(bstrategy_dst);
    4322 ECB             : }
    4323                 : 
    4324                 : /* ---------------------------------------------------------------------
    4325                 :  *      CreateAndCopyRelationData
    4326                 :  *
    4327                 :  *      Create destination relation storage and copy all forks from the
    4328                 :  *      source relation to the destination.
    4329                 :  *
    4330                 :  *      Pass permanent as true for permanent relations and false for
    4331                 :  *      unlogged relations.  Currently this API is not supported for
    4332                 :  *      temporary relations.
    4333                 :  * --------------------------------------------------------------------
    4334                 :  */
    4335                 : void
    4336 GNC       40861 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
    4337                 :                           RelFileLocator dst_rlocator, bool permanent)
    4338 ECB             : {
    4339                 :     RelFileLocatorBackend rlocator;
    4340                 :     char        relpersistence;
    4341                 : 
    4342                 :     /* Set the relpersistence. */
    4343 CBC       40861 :     relpersistence = permanent ?
    4344 ECB             :         RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
    4345                 : 
    4346                 :     /*
     4347                 :      * Create and copy all forks of the relation.  During CREATE DATABASE we
     4348                 :      * have a separate cleanup mechanism which deletes the complete database
     4349                 :      * directory.  Therefore, each individual relation doesn't need to be
     4350                 :      * registered for cleanup.
    4351                 :      */
    4352 GNC       40861 :     RelationCreateStorage(dst_rlocator, relpersistence, false);
    4353                 : 
    4354 ECB             :     /* copy main fork. */
    4355 GNC       40861 :     RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
    4356                 :                                    permanent);
    4357 ECB             : 
    4358                 :     /* copy those extra forks that exist */
    4359 GIC       40861 :     for (ForkNumber forkNum = MAIN_FORKNUM + 1;
    4360 CBC      163444 :          forkNum <= MAX_FORKNUM; forkNum++)
    4361                 :     {
    4362 GNC      122583 :         if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
    4363 ECB             :         {
    4364 GNC       13618 :             smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
    4365                 : 
    4366                 :             /*
    4367                 :              * WAL log creation if the relation is persistent, or this is the
    4368                 :              * init fork of an unlogged relation.
    4369                 :              */
    4370 GIC       13618 :             if (permanent || forkNum == INIT_FORKNUM)
    4371 GNC       13618 :                 log_smgrcreate(&dst_rlocator, forkNum);
    4372                 : 
    4373                 :             /* Copy a fork's data, block by block. */
    4374           13618 :             RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
    4375 ECB             :                                            permanent);
    4376                 :         }
    4377                 :     }
    4378                 : 
     4379                 :     /* close the source and destination smgrs, if they exist */
    4380 GNC       40861 :     rlocator.backend = InvalidBackendId;
    4381                 : 
    4382           40861 :     rlocator.locator = src_rlocator;
    4383           40861 :     smgrcloserellocator(rlocator);
    4384                 : 
    4385           40861 :     rlocator.locator = dst_rlocator;
    4386           40861 :     smgrcloserellocator(rlocator);
    4387 GIC       40861 : }
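
A minimal wrapper sketch with placeholder locators, showing the intended call for a permanent relation (this mirrors what the WAL-logged CREATE DATABASE strategy does per relation, but it is not an excerpt from that code):

    static void
    copy_rel_storage(RelFileLocator src, RelFileLocator dst)
    {
        /* Creates dst's storage, then copies every fork that exists in src. */
        CreateAndCopyRelationData(src, dst, true /* permanent */);
    }
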
    4388 ECB             : 
    4389                 : /* ---------------------------------------------------------------------
    4390                 :  *      FlushDatabaseBuffers
    4391                 :  *
    4392                 :  *      This function writes all dirty pages of a database out to disk
    4393                 :  *      (or more accurately, out to kernel disk buffers), ensuring that the
    4394                 :  *      kernel has an up-to-date view of the database.
    4395                 :  *
    4396                 :  *      Generally, the caller should be holding an appropriate lock to ensure
    4397                 :  *      no other backend is active in the target database; otherwise more
    4398                 :  *      pages could get dirtied.
    4399                 :  *
    4400                 :  *      Note we don't worry about flushing any pages of temporary relations.
    4401                 :  *      It's assumed these wouldn't be interesting.
    4402 EUB             :  * --------------------------------------------------------------------
    4403                 :  */
    4404                 : void
    4405 CBC           3 : FlushDatabaseBuffers(Oid dbid)
    4406 ECB             : {
    4407                 :     int         i;
    4408                 :     BufferDesc *bufHdr;
    4409                 : 
    4410                 :     /* Make sure we can handle the pin inside the loop */
    4411 GIC           3 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    4412                 : 
    4413             387 :     for (i = 0; i < NBuffers; i++)
    4414                 :     {
    4415                 :         uint32      buf_state;
    4416                 : 
    4417             384 :         bufHdr = GetBufferDescriptor(i);
    4418                 : 
    4419 ECB             :         /*
    4420                 :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    4421                 :          * saves some cycles.
    4422                 :          */
    4423 GNC         384 :         if (bufHdr->tag.dbOid != dbid)
    4424 GIC         265 :             continue;
    4425 ECB             : 
    4426 GIC         119 :         ReservePrivateRefCountEntry();
    4427                 : 
    4428             119 :         buf_state = LockBufHdr(bufHdr);
    4429 GNC         119 :         if (bufHdr->tag.dbOid == dbid &&
    4430 GIC         119 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    4431                 :         {
    4432               9 :             PinBuffer_Locked(bufHdr);
    4433               9 :             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
    4434 GNC           9 :             FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4435 CBC           9 :             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    4436 GNC           9 :             UnpinBuffer(bufHdr);
    4437                 :         }
    4438 ECB             :         else
    4439 CBC         110 :             UnlockBufHdr(bufHdr, buf_state);
    4440                 :     }
    4441 GIC           3 : }
    4442 ECB             : 
    4443                 : /*
     4444                 :  * Flush a previously locked (in share or exclusive mode) and pinned buffer
     4445                 :  * out to the OS.
    4446                 :  */
    4447                 : void
    4448 GIC          25 : FlushOneBuffer(Buffer buffer)
    4449 ECB             : {
    4450                 :     BufferDesc *bufHdr;
    4451                 : 
    4452                 :     /* currently not needed, but no fundamental reason not to support */
    4453 GIC          25 :     Assert(!BufferIsLocal(buffer));
    4454                 : 
    4455              25 :     Assert(BufferIsPinned(buffer));
    4456                 : 
    4457 CBC          25 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4458                 : 
    4459              25 :     Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
    4460 ECB             : 
    4461 GNC          25 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4462 CBC          25 : }
    4463                 : 
    4464 EUB             : /*
    4465                 :  * ReleaseBuffer -- release the pin on a buffer
    4466 ECB             :  */
    4467                 : void
    4468 GIC    67676152 : ReleaseBuffer(Buffer buffer)
    4469                 : {
    4470        67676152 :     if (!BufferIsValid(buffer))
    4471 UIC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    4472                 : 
    4473 GIC    67676152 :     if (BufferIsLocal(buffer))
    4474 GNC     1502576 :         UnpinLocalBuffer(buffer);
    4475                 :     else
    4476        66173576 :         UnpinBuffer(GetBufferDescriptor(buffer - 1));
    4477 GIC    67676152 : }
    4478                 : 
    4479                 : /*
    4480                 :  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
    4481                 :  *
    4482                 :  * This is just a shorthand for a common combination.
    4483 ECB             :  */
    4484                 : void
    4485 CBC    19887237 : UnlockReleaseBuffer(Buffer buffer)
    4486                 : {
    4487 GIC    19887237 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    4488        19887237 :     ReleaseBuffer(buffer);
    4489        19887237 : }
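
UnlockReleaseBuffer is the tail end of the usual read-lock-modify sequence. A minimal sketch with placeholder names (rel, blkno), assuming the actual page change would also be WAL-logged:

    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    /* ... modify the page and MarkBufferDirty(buf) ... */
    UnlockReleaseBuffer(buf);   /* drops the content lock, then the pin */
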
    4490                 : 
    4491                 : /*
    4492 ECB             :  * IncrBufferRefCount
    4493                 :  *      Increment the pin count on a buffer that we have *already* pinned
    4494                 :  *      at least once.
    4495                 :  *
    4496                 :  *      This function cannot be used on a buffer we do not have pinned,
    4497                 :  *      because it doesn't change the shared buffer state.
    4498                 :  */
    4499 EUB             : void
    4500 GIC    11173807 : IncrBufferRefCount(Buffer buffer)
    4501 ECB             : {
    4502 GIC    11173807 :     Assert(BufferIsPinned(buffer));
    4503        11173807 :     ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
    4504        11173807 :     if (BufferIsLocal(buffer))
    4505          346867 :         LocalRefCount[-buffer - 1]++;
    4506                 :     else
    4507                 :     {
    4508                 :         PrivateRefCountEntry *ref;
    4509                 : 
    4510        10826940 :         ref = GetPrivateRefCountEntry(buffer, true);
    4511        10826940 :         Assert(ref != NULL);
    4512        10826940 :         ref->refcount++;
    4513                 :     }
    4514        11173807 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
    4515        11173807 : }
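
A sketch of the intended use, assuming buf is a buffer this backend has already pinned (for example via ReadBuffer): each logical reference takes its own pin, and each pin is released separately.

    IncrBufferRefCount(buf);    /* second reference to the already-pinned buffer */
    /* ... both references are in use ... */
    ReleaseBuffer(buf);         /* releases the extra pin */
    ReleaseBuffer(buf);         /* releases the original pin */
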
    4516                 : 
    4517                 : /*
    4518                 :  * MarkBufferDirtyHint
    4519                 :  *
    4520                 :  *  Mark a buffer dirty for non-critical changes.
    4521                 :  *
    4522                 :  * This is essentially the same as MarkBufferDirty, except:
    4523                 :  *
    4524                 :  * 1. The caller does not write WAL; so if checksums are enabled, we may need
    4525                 :  *    to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
    4526                 :  * 2. The caller might have only share-lock instead of exclusive-lock on the
    4527                 :  *    buffer's content lock.
    4528                 :  * 3. This function does not guarantee that the buffer is always marked dirty
    4529                 :  *    (due to a race condition), so it cannot be used for important changes.
    4530                 :  */
    4531                 : void
    4532        16382378 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
    4533                 : {
    4534                 :     BufferDesc *bufHdr;
    4535        16382378 :     Page        page = BufferGetPage(buffer);
    4536                 : 
    4537        16382378 :     if (!BufferIsValid(buffer))
    4538 UIC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    4539                 : 
    4540 GIC    16382378 :     if (BufferIsLocal(buffer))
    4541                 :     {
    4542          678465 :         MarkLocalBufferDirty(buffer);
    4543          678465 :         return;
    4544                 :     }
    4545                 : 
    4546        15703913 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4547                 : 
    4548        15703913 :     Assert(GetPrivateRefCount(buffer) > 0);
    4549                 :     /* here, either share or exclusive lock is OK */
    4550        15703913 :     Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
    4551                 : 
    4552                 :     /*
    4553                 :      * This routine might get called many times on the same page, if we are
    4554                 :      * making the first scan after commit of an xact that added/deleted many
    4555                 :      * tuples. So, be as quick as we can if the buffer is already dirty.  We
    4556                 :      * do this by not acquiring spinlock if it looks like the status bits are
    4557                 :      * already set.  Since we make this test unlocked, there's a chance we
    4558                 :      * might fail to notice that the flags have just been cleared, and failed
    4559                 :      * to reset them, due to memory-ordering issues.  But since this function
    4560                 :      * is only intended to be used in cases where failing to write out the
    4561                 :      * data would be harmless anyway, it doesn't really matter.
    4562                 :      */
    4563        15703913 :     if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
    4564                 :         (BM_DIRTY | BM_JUST_DIRTIED))
    4565                 :     {
    4566         1716964 :         XLogRecPtr  lsn = InvalidXLogRecPtr;
    4567         1716964 :         bool        dirtied = false;
    4568         1716964 :         bool        delayChkptFlags = false;
    4569                 :         uint32      buf_state;
    4570                 : 
    4571                 :         /*
    4572                 :          * If we need to protect hint bit updates from torn writes, WAL-log a
    4573                 :          * full page image of the page. This full page image is only necessary
    4574                 :          * if the hint bit update is the first change to the page since the
    4575                 :          * last checkpoint.
    4576                 :          *
    4577                 :          * We don't check full_page_writes here because that logic is included
    4578                 :          * when we call XLogInsert() since the value changes dynamically.
    4579                 :          */
    4580 CBC     3384330 :         if (XLogHintBitIsNeeded() &&
    4581 GIC     1667366 :             (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
    4582                 :         {
    4583                 :             /*
    4584                 :              * If we must not write WAL, due to a relfilelocator-specific
    4585 ECB             :              * condition or being in recovery, don't dirty the page.  We can
     4586                 :              * still set the hint, just not dirty the page as a result, so
     4587                 :              * the hint is lost when we evict the page or shut down.
    4588                 :              *
    4589                 :              * See src/backend/storage/page/README for longer discussion.
    4590                 :              */
    4591 GIC     1716890 :             if (RecoveryInProgress() ||
    4592 GNC       49527 :                 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
    4593 CBC     1617836 :                 return;
    4594 ECB             : 
    4595                 :             /*
    4596                 :              * If the block is already dirty because we either made a change
    4597                 :              * or set a hint already, then we don't need to write a full page
    4598                 :              * image.  Note that aggressive cleaning of blocks dirtied by hint
    4599                 :              * bit setting would increase the call rate. Bulk setting of hint
    4600                 :              * bits would reduce the call rate...
    4601                 :              *
    4602                 :              * We must issue the WAL record before we mark the buffer dirty.
    4603                 :              * Otherwise we might write the page before we write the WAL. That
    4604                 :              * causes a race condition, since a checkpoint might occur between
    4605                 :              * writing the WAL record and marking the buffer dirty. We solve
    4606                 :              * that with a kluge, but one that is already in use during
    4607                 :              * transaction commit to prevent race conditions. Basically, we
    4608                 :              * simply prevent the checkpoint WAL record from being written
    4609                 :              * until we have marked the buffer dirty. We don't start the
    4610                 :              * checkpoint flush until we have marked dirty, so our checkpoint
    4611                 :              * must flush the change to disk successfully or the checkpoint
     4612                 :              * never gets written, in which case crash recovery will fix it.
    4613                 :              *
    4614                 :              * It's possible we may enter here without an xid, so it is
    4615                 :              * essential that CreateCheckPoint waits for virtual transactions
     4616                 :              * rather than full transaction IDs.
    4617                 :              */
    4618 CBC       49527 :             Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
    4619 GIC       49527 :             MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    4620           49527 :             delayChkptFlags = true;
    4621           49527 :             lsn = XLogSaveBufferForHint(buffer, buffer_std);
    4622 ECB             :         }
    4623                 : 
    4624 GIC       99128 :         buf_state = LockBufHdr(bufHdr);
    4625 ECB             : 
    4626 GIC       99128 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    4627                 : 
    4628 CBC       99128 :         if (!(buf_state & BM_DIRTY))
    4629                 :         {
    4630 GIC       99112 :             dirtied = true;     /* Means "will be dirtied by this action" */
    4631                 : 
    4632 ECB             :             /*
    4633                 :              * Set the page LSN if we wrote a backup block. We aren't supposed
    4634                 :              * to set this when only holding a share lock but as long as we
    4635                 :              * serialise it somehow we're OK. We choose to set LSN while
    4636                 :              * holding the buffer header lock, which causes any reader of an
    4637                 :              * LSN who holds only a share lock to also obtain a buffer header
    4638                 :              * lock before using PageGetLSN(), which is enforced in
    4639                 :              * BufferGetLSNAtomic().
    4640                 :              *
    4641                 :              * If checksums are enabled, you might think we should reset the
    4642                 :              * checksum here. That will happen when the page is written
    4643                 :              * sometime later in this checkpoint cycle.
    4644                 :              */
    4645 GIC       99112 :             if (!XLogRecPtrIsInvalid(lsn))
    4646            5130 :                 PageSetLSN(page, lsn);
    4647                 :         }
    4648 ECB             : 
    4649 CBC       99128 :         buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
    4650 GIC       99128 :         UnlockBufHdr(bufHdr, buf_state);
    4651 ECB             : 
    4652 GIC       99128 :         if (delayChkptFlags)
    4653 CBC       49527 :             MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    4654 ECB             : 
    4655 CBC       99128 :         if (dirtied)
    4656                 :         {
    4657           99112 :             VacuumPageDirty++;
    4658           99112 :             pgBufferUsage.shared_blks_dirtied++;
    4659           99112 :             if (VacuumCostActive)
    4660             445 :                 VacuumCostBalance += VacuumCostPageDirty;
    4661 ECB             :         }
    4662                 :     }
    4663                 : }
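/*
 * Editor's sketch, not part of bufmgr.c: a typical caller of the function
 * ending above (assumed here to be MarkBufferDirtyHint()).  The caller sets
 * a hint bit while holding only a pin and a shared content lock, then lets
 * the buffer manager decide whether the page may actually be dirtied and
 * whether a full-page image must be WAL-logged first.  The function name
 * below is illustrative only.
 */
static void
set_hint_bit_example(Buffer buffer, HeapTupleHeader tuple)
{
    /* caller already holds a pin and at least a share content lock */
    tuple->t_infomask |= HEAP_XMIN_COMMITTED;   /* the hint itself */

    /* report the hint; true = standard page layout */
    MarkBufferDirtyHint(buffer, true);
}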
    4664                 : 
    4665                 : /*
    4666                 :  * Release buffer content locks for shared buffers.
    4667                 :  *
    4668                 :  * Used to clean up after errors.
    4669                 :  *
    4670                 :  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
    4671                 :  * of releasing buffer content locks per se; the only thing we need to deal
    4672                 :  * with here is clearing any PIN_COUNT request that was in progress.
    4673                 :  */
    4674                 : void
    4675 GIC       37890 : UnlockBuffers(void)
    4676                 : {
    4677           37890 :     BufferDesc *buf = PinCountWaitBuf;
    4678 ECB             : 
    4679 GIC       37890 :     if (buf)
    4680                 :     {
    4681                 :         uint32      buf_state;
    4682                 : 
    4683 UIC           0 :         buf_state = LockBufHdr(buf);
    4684 ECB             : 
    4685 EUB             :         /*
    4686                 :          * Don't complain if flag bit not set; it could have been reset but we
    4687                 :          * got a cancel/die interrupt before getting the signal.
    4688 ECB             :          */
    4689 UIC           0 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    4690 LBC           0 :             buf->wait_backend_pgprocno == MyProc->pgprocno)
    4691 UIC           0 :             buf_state &= ~BM_PIN_COUNT_WAITER;
    4692 ECB             : 
    4693 UIC           0 :         UnlockBufHdr(buf, buf_state);
    4694 ECB             : 
    4695 LBC           0 :         PinCountWaitBuf = NULL;
    4696                 :     }
    4697 GIC       37890 : }
    4698                 : 
    4699                 : /*
    4700                 :  * Acquire or release the content_lock for the buffer.
    4701                 :  */
    4702 ECB             : void
    4703 GIC   196227071 : LockBuffer(Buffer buffer, int mode)
    4704                 : {
    4705 ECB             :     BufferDesc *buf;
    4706 EUB             : 
    4707 GIC   196227071 :     Assert(BufferIsPinned(buffer));
    4708       196227071 :     if (BufferIsLocal(buffer))
    4709 CBC     9816707 :         return;                 /* local buffers need no lock */
    4710                 : 
    4711       186410364 :     buf = GetBufferDescriptor(buffer - 1);
    4712                 : 
    4713       186410364 :     if (mode == BUFFER_LOCK_UNLOCK)
    4714        94045800 :         LWLockRelease(BufferDescriptorGetContentLock(buf));
    4715 GIC    92364564 :     else if (mode == BUFFER_LOCK_SHARE)
    4716        64459683 :         LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
    4717        27904881 :     else if (mode == BUFFER_LOCK_EXCLUSIVE)
    4718        27904881 :         LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
    4719                 :     else
    4720 UIC           0 :         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
    4721                 : }
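/*
 * Editor's sketch, not part of bufmgr.c: the usual calling pattern around
 * LockBuffer() -- pin the page, take the content lock in the needed mode,
 * use the page, then unlock and unpin.  Only the function name below is
 * made up; ReadBuffer(), ReleaseBuffer() and BufferGetPage() are the real
 * bufmgr APIs.
 */
static void
read_one_page_example(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* pins the buffer */

    LockBuffer(buf, BUFFER_LOCK_SHARE);         /* content lock for reading */
    /* ... inspect BufferGetPage(buf) here ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    ReleaseBuffer(buf);                         /* drop the pin */
}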
    4722 ECB             : 
    4723                 : /*
    4724                 :  * Acquire the content_lock for the buffer, but only if we don't have to wait.
    4725                 :  *
    4726                 :  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
    4727                 :  */
    4728                 : bool
    4729 GIC     1220713 : ConditionalLockBuffer(Buffer buffer)
    4730 ECB             : {
    4731                 :     BufferDesc *buf;
    4732                 : 
    4733 GIC     1220713 :     Assert(BufferIsPinned(buffer));
    4734         1220713 :     if (BufferIsLocal(buffer))
    4735           64689 :         return true;            /* act as though we got it */
    4736                 : 
    4737         1156024 :     buf = GetBufferDescriptor(buffer - 1);
    4738                 : 
    4739 GBC     1156024 :     return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
    4740 EUB             :                                     LW_EXCLUSIVE);
    4741                 : }
    4742                 : 
    4743                 : /*
    4744                 :  * Verify that this backend is pinning the buffer exactly once.
    4745                 :  *
    4746                 :  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
    4747                 :  * holds a pin on the buffer.  We do not care whether some other backend does.
    4748                 :  */
    4749                 : void
    4750 GNC     3785962 : CheckBufferIsPinnedOnce(Buffer buffer)
    4751                 : {
    4752         3785962 :     if (BufferIsLocal(buffer))
    4753                 :     {
    4754              16 :         if (LocalRefCount[-buffer - 1] != 1)
    4755 UNC           0 :             elog(ERROR, "incorrect local pin count: %d",
    4756                 :                  LocalRefCount[-buffer - 1]);
    4757                 :     }
    4758                 :     else
    4759                 :     {
    4760 GNC     3785946 :         if (GetPrivateRefCount(buffer) != 1)
    4761 UNC           0 :             elog(ERROR, "incorrect local pin count: %d",
    4762                 :                  GetPrivateRefCount(buffer));
    4763                 :     }
    4764 GNC     3785962 : }
    4765                 : 
    4766                 : /*
    4767                 :  * LockBufferForCleanup - lock a buffer in preparation for deleting items
    4768                 :  *
    4769 ECB             :  * Items may be deleted from a disk page only when the caller (a) holds an
    4770                 :  * exclusive lock on the buffer and (b) has observed that no other backend
    4771                 :  * holds a pin on the buffer.  If there is a pin, then the other backend
    4772                 :  * might have a pointer into the buffer (for example, a heapscan reference
    4773                 :  * to an item --- see README for more details).  It's OK if a pin is added
    4774                 :  * after the cleanup starts, however; the newly-arrived backend will be
    4775                 :  * unable to look at the page until we release the exclusive lock.
    4776                 :  *
    4777                 :  * To implement this protocol, a would-be deleter must pin the buffer and
    4778                 :  * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
    4779                 :  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
    4780                 :  * it has successfully observed pin count = 1.
    4781                 :  */
    4782                 : void
    4783 GIC       37384 : LockBufferForCleanup(Buffer buffer)
    4784                 : {
    4785 ECB             :     BufferDesc *bufHdr;
    4786 GIC       37384 :     TimestampTz waitStart = 0;
    4787 GNC       37384 :     bool        waiting = false;
    4788 CBC       37384 :     bool        logged_recovery_conflict = false;
    4789                 : 
    4790 GIC       37384 :     Assert(BufferIsPinned(buffer));
    4791           37384 :     Assert(PinCountWaitBuf == NULL);
    4792                 : 
    4793 GNC       37384 :     CheckBufferIsPinnedOnce(buffer);
    4794 ECB             : 
    4795                 :     /* Nobody else to wait for */
    4796 GNC       37384 :     if (BufferIsLocal(buffer))
    4797              16 :         return;
    4798                 : 
    4799 GIC       37368 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4800                 : 
    4801                 :     for (;;)
    4802              11 :     {
    4803                 :         uint32      buf_state;
    4804                 : 
    4805                 :         /* Try to acquire lock */
    4806           37379 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    4807           37379 :         buf_state = LockBufHdr(bufHdr);
    4808                 : 
    4809           37379 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    4810           37379 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    4811                 :         {
    4812                 :             /* Successfully acquired exclusive lock with pincount 1 */
    4813 CBC       37368 :             UnlockBufHdr(bufHdr, buf_state);
    4814                 : 
    4815                 :             /*
    4816 ECB             :              * Emit the log message if recovery conflict on buffer pin was
    4817                 :              * resolved but the startup process waited longer than
    4818                 :              * deadlock_timeout for it.
    4819                 :              */
    4820 CBC       37368 :             if (logged_recovery_conflict)
    4821               2 :                 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
    4822                 :                                     waitStart, GetCurrentTimestamp(),
    4823                 :                                     NULL, false);
    4824                 : 
    4825 GNC       37368 :             if (waiting)
    4826 ECB             :             {
    4827                 :                 /* reset ps display to remove the suffix if we added one */
    4828 GNC           2 :                 set_ps_display_remove_suffix();
    4829               2 :                 waiting = false;
    4830                 :             }
    4831 GIC       37368 :             return;
    4832 ECB             :         }
    4833                 :         /* Failed, so mark myself as waiting for pincount 1 */
    4834 GIC          11 :         if (buf_state & BM_PIN_COUNT_WAITER)
    4835                 :         {
    4836 LBC           0 :             UnlockBufHdr(bufHdr, buf_state);
    4837 UIC           0 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    4838 LBC           0 :             elog(ERROR, "multiple backends attempting to wait for pincount 1");
    4839                 :         }
    4840 GIC          11 :         bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
    4841 CBC          11 :         PinCountWaitBuf = bufHdr;
    4842 GIC          11 :         buf_state |= BM_PIN_COUNT_WAITER;
    4843              11 :         UnlockBufHdr(bufHdr, buf_state);
    4844 CBC          11 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    4845 ECB             : 
    4846                 :         /* Wait to be signaled by UnpinBuffer() */
    4847 CBC          11 :         if (InHotStandby)
    4848                 :         {
    4849 GNC          11 :             if (!waiting)
    4850                 :             {
    4851                 :                 /* adjust the process title to indicate that it's waiting */
    4852               2 :                 set_ps_display_suffix("waiting");
    4853               2 :                 waiting = true;
    4854                 :             }
    4855 ECB             : 
    4856                 :             /*
    4857                 :              * Emit the log message if the startup process is waiting longer
    4858                 :              * than deadlock_timeout for recovery conflict on buffer pin.
    4859                 :              *
    4860                 :              * Skip this if first time through because the startup process has
    4861                 :              * not started waiting yet in this case. So, the wait start
    4862                 :              * timestamp is set after this logic.
    4863                 :              */
    4864 GIC          11 :             if (waitStart != 0 && !logged_recovery_conflict)
    4865                 :             {
    4866               4 :                 TimestampTz now = GetCurrentTimestamp();
    4867                 : 
    4868               4 :                 if (TimestampDifferenceExceeds(waitStart, now,
    4869                 :                                                DeadlockTimeout))
    4870                 :                 {
    4871               2 :                     LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
    4872                 :                                         waitStart, now, NULL, true);
    4873               2 :                     logged_recovery_conflict = true;
    4874                 :                 }
    4875                 :             }
    4876                 : 
    4877 ECB             :             /*
    4878                 :              * Set the wait start timestamp if logging is enabled and first
    4879                 :              * time through.
    4880                 :              */
    4881 GIC          11 :             if (log_recovery_conflict_waits && waitStart == 0)
    4882               2 :                 waitStart = GetCurrentTimestamp();
    4883                 : 
    4884 ECB             :             /* Publish the bufid that Startup process waits on */
    4885 GIC          11 :             SetStartupBufferPinWaitBufId(buffer - 1);
    4886                 :             /* Set alarm and then wait to be signaled by UnpinBuffer() */
    4887              11 :             ResolveRecoveryConflictWithBufferPin();
    4888                 :             /* Reset the published bufid */
    4889              11 :             SetStartupBufferPinWaitBufId(-1);
    4890                 :         }
    4891                 :         else
    4892 UIC           0 :             ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
    4893 ECB             : 
    4894                 :         /*
    4895                 :          * Remove flag marking us as waiter. Normally this will not be set
    4896                 :          * anymore, but ProcWaitForSignal() can return for other signals as
    4897                 :          * well.  We take care to only reset the flag if we're the waiter, as
    4898                 :          * theoretically another backend could have started waiting. That's
    4899                 :          * impossible with the current usages due to table level locking, but
    4900                 :          * better be safe.
    4901                 :          */
    4902 GIC          11 :         buf_state = LockBufHdr(bufHdr);
    4903 CBC          11 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    4904 GIC           9 :             bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
    4905 CBC           9 :             buf_state &= ~BM_PIN_COUNT_WAITER;
    4906 GIC          11 :         UnlockBufHdr(bufHdr, buf_state);
    4907                 : 
    4908              11 :         PinCountWaitBuf = NULL;
    4909                 :         /* Loop back and try again */
    4910                 :     }
    4911 ECB             : }
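/*
 * Editor's sketch, not part of bufmgr.c: the would-be deleter protocol from
 * the header comment above -- pin the buffer first, then let
 * LockBufferForCleanup() wait until the exclusive lock is held with
 * pin count 1.  The function name is illustrative only.
 */
static void
delete_page_items_example(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* take our own pin */

    LockBufferForCleanup(buf);  /* may sleep until other pins go away */
    /* ... remove tuples/line pointers from BufferGetPage(buf) ... */
    UnlockReleaseBuffer(buf);   /* drops the lock and the pin */
}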
    4912                 : 
    4913                 : /*
    4914                 :  * Check called from RecoveryConflictInterrupt handler when Startup
    4915                 :  * process requests cancellation of all pin holders that are blocking it.
    4916                 :  */
    4917                 : bool
    4918 GIC           4 : HoldingBufferPinThatDelaysRecovery(void)
    4919                 : {
    4920               4 :     int         bufid = GetStartupBufferPinWaitBufId();
    4921 ECB             : 
    4922                 :     /*
    4923                 :      * If we get woken slowly then it's possible that the Startup process was
    4924                 :      * already woken by other backends before we got here. Also possible that
    4925                 :      * we get here by multiple interrupts or interrupts at inappropriate
    4926                 :      * times, so make sure we do nothing if the bufid is not set.
    4927                 :      */
    4928 CBC           4 :     if (bufid < 0)
    4929 GIC           2 :         return false;
    4930                 : 
    4931               2 :     if (GetPrivateRefCount(bufid + 1) > 0)
    4932               2 :         return true;
    4933                 : 
    4934 UIC           0 :     return false;
    4935                 : }
    4936                 : 
    4937                 : /*
    4938                 :  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
    4939                 :  *
    4940                 :  * We won't loop, but just check once to see if the pin count is OK.  If
    4941                 :  * not, return false with no lock held.
    4942                 :  */
    4943                 : bool
    4944 GIC      270333 : ConditionalLockBufferForCleanup(Buffer buffer)
    4945                 : {
    4946 ECB             :     BufferDesc *bufHdr;
    4947                 :     uint32      buf_state,
    4948                 :                 refcount;
    4949                 : 
    4950 GIC      270333 :     Assert(BufferIsValid(buffer));
    4951                 : 
    4952 CBC      270333 :     if (BufferIsLocal(buffer))
    4953                 :     {
    4954             782 :         refcount = LocalRefCount[-buffer - 1];
    4955                 :         /* There should be exactly one pin */
    4956 GIC         782 :         Assert(refcount > 0);
    4957             782 :         if (refcount != 1)
    4958 CBC          21 :             return false;
    4959                 :         /* Nobody else to wait for */
    4960 GIC         761 :         return true;
    4961                 :     }
    4962                 : 
    4963                 :     /* There should be exactly one local pin */
    4964 CBC      269551 :     refcount = GetPrivateRefCount(buffer);
    4965          269551 :     Assert(refcount);
    4966 GIC      269551 :     if (refcount != 1)
    4967 CBC         210 :         return false;
    4968                 : 
    4969 ECB             :     /* Try to acquire lock */
    4970 CBC      269341 :     if (!ConditionalLockBuffer(buffer))
    4971              28 :         return false;
    4972                 : 
    4973          269313 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4974          269313 :     buf_state = LockBufHdr(bufHdr);
    4975          269313 :     refcount = BUF_STATE_GET_REFCOUNT(buf_state);
    4976 ECB             : 
    4977 CBC      269313 :     Assert(refcount > 0);
    4978 GIC      269313 :     if (refcount == 1)
    4979                 :     {
    4980 ECB             :         /* Successfully acquired exclusive lock with pincount 1 */
    4981 GIC      269275 :         UnlockBufHdr(bufHdr, buf_state);
    4982 CBC      269275 :         return true;
    4983                 :     }
    4984                 : 
    4985                 :     /* Failed, so release the lock */
    4986 GIC          38 :     UnlockBufHdr(bufHdr, buf_state);
    4987              38 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    4988              38 :     return false;
    4989 ECB             : }
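/*
 * Editor's sketch, not part of bufmgr.c: the "skip the page if we cannot
 * clean it up right now" pattern this function supports, as opportunistic
 * pruning/vacuum-style callers use it.  The function name is illustrative
 * only.
 */
static void
try_cleanup_page_example(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    if (ConditionalLockBufferForCleanup(buf))
    {
        /* exclusive lock with pin count 1: cleanup is safe */
        UnlockReleaseBuffer(buf);
    }
    else
        ReleaseBuffer(buf);     /* contended; just move on to the next page */
}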
    4990                 : 
    4991                 : /*
    4992                 :  * IsBufferCleanupOK - as above, but we already have the lock
    4993                 :  *
    4994                 :  * Check whether it's OK to perform cleanup on a buffer we've already
    4995                 :  * locked.  If we observe that the pin count is 1, our exclusive lock
    4996                 :  * happens to be a cleanup lock, and we can proceed with anything that
    4997                 :  * would have been allowable had we sought a cleanup lock originally.
    4998                 :  */
    4999                 : bool
    5000 CBC        2016 : IsBufferCleanupOK(Buffer buffer)
    5001                 : {
    5002 ECB             :     BufferDesc *bufHdr;
    5003                 :     uint32      buf_state;
    5004                 : 
    5005 GIC        2016 :     Assert(BufferIsValid(buffer));
    5006                 : 
    5007            2016 :     if (BufferIsLocal(buffer))
    5008                 :     {
    5009 ECB             :         /* There should be exactly one pin */
    5010 UIC           0 :         if (LocalRefCount[-buffer - 1] != 1)
    5011 LBC           0 :             return false;
    5012 EUB             :         /* Nobody else to wait for */
    5013 UIC           0 :         return true;
    5014 ECB             :     }
    5015                 : 
    5016                 :     /* There should be exactly one local pin */
    5017 CBC        2016 :     if (GetPrivateRefCount(buffer) != 1)
    5018 LBC           0 :         return false;
    5019                 : 
    5020 GIC        2016 :     bufHdr = GetBufferDescriptor(buffer - 1);
    5021                 : 
    5022                 :     /* caller must hold exclusive lock on buffer */
    5023            2016 :     Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
    5024                 :                                 LW_EXCLUSIVE));
    5025                 : 
    5026 CBC        2016 :     buf_state = LockBufHdr(bufHdr);
    5027                 : 
    5028            2016 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    5029            2016 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    5030 ECB             :     {
    5031                 :         /* pincount is OK. */
    5032 GIC        2016 :         UnlockBufHdr(bufHdr, buf_state);
    5033            2016 :         return true;
    5034                 :     }
    5035                 : 
    5036 UIC           0 :     UnlockBufHdr(bufHdr, buf_state);
    5037               0 :     return false;
    5038                 : }
    5039                 : 
    5040                 : 
    5041 ECB             : /*
    5042                 :  *  Functions for buffer I/O handling
    5043                 :  *
    5044                 :  *  Note: We assume that nested buffer I/O never occurs.
     5045                 :  *  i.e. at most one BM_IO_IN_PROGRESS bit is set per proc.
    5046                 :  *
    5047                 :  *  Also note that these are used only for shared buffers, not local ones.
    5048                 :  */
    5049                 : 
    5050                 : /*
    5051                 :  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
    5052                 :  */
    5053                 : static void
    5054 GIC        1696 : WaitIO(BufferDesc *buf)
    5055 ECB             : {
    5056 CBC        1696 :     ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
    5057                 : 
    5058 GIC        1696 :     ConditionVariablePrepareToSleep(cv);
    5059                 :     for (;;)
    5060            1623 :     {
    5061                 :         uint32      buf_state;
    5062                 : 
    5063                 :         /*
    5064                 :          * It may not be necessary to acquire the spinlock to check the flag
    5065                 :          * here, but since this test is essential for correctness, we'd better
    5066                 :          * play it safe.
    5067                 :          */
    5068            3319 :         buf_state = LockBufHdr(buf);
    5069            3319 :         UnlockBufHdr(buf, buf_state);
    5070                 : 
    5071            3319 :         if (!(buf_state & BM_IO_IN_PROGRESS))
    5072            1696 :             break;
    5073 CBC        1623 :         ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
    5074                 :     }
    5075 GIC        1696 :     ConditionVariableCancelSleep();
    5076 CBC        1696 : }
    5077                 : 
    5078 ECB             : /*
    5079 EUB             :  * StartBufferIO: begin I/O on this buffer
    5080                 :  *  (Assumptions)
    5081 ECB             :  *  My process is executing no IO
    5082                 :  *  The buffer is Pinned
    5083                 :  *
    5084                 :  * In some scenarios there are race conditions in which multiple backends
    5085                 :  * could attempt the same I/O operation concurrently.  If someone else
    5086                 :  * has already started I/O on this buffer then we will block on the
     5087                 :  * I/O condition variable until that backend is done.
    5088                 :  *
    5089                 :  * Input operations are only attempted on buffers that are not BM_VALID,
    5090                 :  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
    5091                 :  * so we can always tell if the work is already done.
    5092                 :  *
    5093                 :  * Returns true if we successfully marked the buffer as I/O busy,
    5094                 :  * false if someone else already did the work.
    5095                 :  */
    5096                 : static bool
    5097 GIC     2596864 : StartBufferIO(BufferDesc *buf, bool forInput)
    5098                 : {
    5099                 :     uint32      buf_state;
    5100                 : 
    5101 GNC     2596864 :     ResourceOwnerEnlargeBufferIOs(CurrentResourceOwner);
    5102                 : 
    5103                 :     for (;;)
    5104 ECB             :     {
    5105 GIC     2598558 :         buf_state = LockBufHdr(buf);
    5106                 : 
    5107 CBC     2598558 :         if (!(buf_state & BM_IO_IN_PROGRESS))
    5108         2596864 :             break;
    5109            1694 :         UnlockBufHdr(buf, buf_state);
    5110 GIC        1694 :         WaitIO(buf);
    5111                 :     }
    5112                 : 
    5113                 :     /* Once we get here, there is definitely no I/O active on this buffer */
    5114                 : 
    5115         2596864 :     if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
    5116                 :     {
    5117                 :         /* someone else already did the I/O */
    5118            1768 :         UnlockBufHdr(buf, buf_state);
    5119            1768 :         return false;
    5120                 :     }
    5121 ECB             : 
    5122 CBC     2595096 :     buf_state |= BM_IO_IN_PROGRESS;
    5123 GIC     2595096 :     UnlockBufHdr(buf, buf_state);
    5124                 : 
    5125 GNC     2595096 :     ResourceOwnerRememberBufferIO(CurrentResourceOwner,
    5126                 :                                   BufferDescriptorGetBuffer(buf));
    5127                 : 
    5128 GIC     2595096 :     return true;
    5129                 : }
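/*
 * Editor's sketch, not part of bufmgr.c: the intended pairing of
 * StartBufferIO() and TerminateBufferIO() around a read, following the
 * contract described above.  The real read path (smgrread(), checksum
 * verification, error handling) is omitted; illustrative only.
 */
static void
read_into_buffer_example(BufferDesc *bufHdr)
{
    if (!StartBufferIO(bufHdr, true))
        return;                 /* someone else already read the page in */

    /* ... smgrread() the block into the buffer's page here ... */

    /* clear BM_IO_IN_PROGRESS and mark the page valid */
    TerminateBufferIO(bufHdr, false, BM_VALID);
}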
    5130                 : 
    5131                 : /*
    5132 ECB             :  * TerminateBufferIO: release a buffer we were doing I/O on
    5133                 :  *  (Assumptions)
    5134                 :  *  My process is executing IO for the buffer
    5135                 :  *  BM_IO_IN_PROGRESS bit is set for the buffer
    5136                 :  *  The buffer is Pinned
    5137                 :  *
    5138                 :  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
    5139                 :  * buffer's BM_DIRTY flag.  This is appropriate when terminating a
    5140                 :  * successful write.  The check on BM_JUST_DIRTIED is necessary to avoid
    5141                 :  * marking the buffer clean if it was re-dirtied while we were writing.
    5142                 :  *
    5143                 :  * set_flag_bits gets ORed into the buffer's flags.  It must include
    5144                 :  * BM_IO_ERROR in a failure case.  For successful completion it could
    5145                 :  * be 0, or BM_VALID if we just finished reading in the page.
    5146                 :  */
    5147                 : static void
    5148 GIC     2595096 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
    5149                 : {
    5150                 :     uint32      buf_state;
    5151                 : 
    5152         2595096 :     buf_state = LockBufHdr(buf);
    5153                 : 
    5154         2595096 :     Assert(buf_state & BM_IO_IN_PROGRESS);
    5155                 : 
    5156         2595096 :     buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
    5157 CBC     2595096 :     if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
    5158          725804 :         buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
    5159 ECB             : 
    5160 CBC     2595096 :     buf_state |= set_flag_bits;
    5161 GIC     2595096 :     UnlockBufHdr(buf, buf_state);
    5162                 : 
    5163 GNC     2595096 :     ResourceOwnerForgetBufferIO(CurrentResourceOwner,
    5164                 :                                 BufferDescriptorGetBuffer(buf));
    5165                 : 
    5166 CBC     2595096 :     ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
    5167 GIC     2595096 : }
    5168 ECB             : 
    5169                 : /*
    5170                 :  * AbortBufferIO: Clean up active buffer I/O after an error.
    5171                 :  *
    5172                 :  *  All LWLocks we might have held have been released,
    5173                 :  *  but we haven't yet released buffer pins, so the buffer is still pinned.
    5174                 :  *
    5175                 :  *  If I/O was in progress, we always set BM_IO_ERROR, even though it's
    5176                 :  *  possible the error condition wasn't related to the I/O.
    5177                 :  */
    5178                 : void
    5179 GNC          13 : AbortBufferIO(Buffer buf)
    5180                 : {
    5181              13 :     BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
    5182                 :     uint32      buf_state;
    5183                 : 
    5184              13 :     buf_state = LockBufHdr(buf_hdr);
    5185              13 :     Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
    5186                 : 
    5187              13 :     if (!(buf_state & BM_VALID))
    5188                 :     {
    5189              13 :         Assert(!(buf_state & BM_DIRTY));
    5190              13 :         UnlockBufHdr(buf_hdr, buf_state);
    5191 ECB             :     }
    5192                 :     else
    5193                 :     {
    5194 UNC           0 :         Assert(buf_state & BM_DIRTY);
    5195               0 :         UnlockBufHdr(buf_hdr, buf_state);
    5196                 : 
    5197                 :         /* Issue notice if this is not the first failure... */
    5198               0 :         if (buf_state & BM_IO_ERROR)
    5199                 :         {
    5200                 :             /* Buffer is pinned, so we can read tag without spinlock */
    5201                 :             char       *path;
    5202                 : 
    5203               0 :             path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
    5204                 :                                BufTagGetForkNum(&buf_hdr->tag));
    5205               0 :             ereport(WARNING,
    5206                 :                     (errcode(ERRCODE_IO_ERROR),
    5207                 :                      errmsg("could not write block %u of %s",
    5208                 :                             buf_hdr->tag.blockNum, path),
    5209                 :                      errdetail("Multiple failures --- write error might be permanent.")));
    5210               0 :             pfree(path);
    5211                 :         }
    5212                 :     }
    5213                 : 
    5214 GNC          13 :     TerminateBufferIO(buf_hdr, false, BM_IO_ERROR);
    5215 GIC          13 : }
    5216 ECB             : 
    5217                 : /*
    5218                 :  * Error context callback for errors occurring during shared buffer writes.
    5219                 :  */
    5220 EUB             : static void
    5221 GIC          73 : shared_buffer_write_error_callback(void *arg)
    5222                 : {
    5223              73 :     BufferDesc *bufHdr = (BufferDesc *) arg;
    5224                 : 
    5225                 :     /* Buffer is pinned, so we can read the tag without locking the spinlock */
    5226 GBC          73 :     if (bufHdr != NULL)
    5227 EUB             :     {
    5228 GNC          73 :         char       *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
    5229                 :                                        BufTagGetForkNum(&bufHdr->tag));
    5230                 : 
    5231 GBC          73 :         errcontext("writing block %u of relation %s",
    5232                 :                    bufHdr->tag.blockNum, path);
    5233              73 :         pfree(path);
    5234                 :     }
    5235 CBC          73 : }
    5236                 : 
    5237                 : /*
    5238                 :  * Error context callback for errors occurring during local buffer writes.
    5239                 :  */
    5240                 : static void
    5241 LBC           0 : local_buffer_write_error_callback(void *arg)
    5242                 : {
    5243 UIC           0 :     BufferDesc *bufHdr = (BufferDesc *) arg;
    5244                 : 
    5245 LBC           0 :     if (bufHdr != NULL)
    5246 ECB             :     {
    5247 UNC           0 :         char       *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
    5248                 :                                           MyBackendId,
    5249                 :                                           BufTagGetForkNum(&bufHdr->tag));
    5250 ECB             : 
    5251 UIC           0 :         errcontext("writing block %u of relation %s",
    5252 ECB             :                    bufHdr->tag.blockNum, path);
    5253 LBC           0 :         pfree(path);
    5254 ECB             :     }
    5255 LBC           0 : }
    5256 ECB             : 
    5257                 : /*
    5258                 :  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
    5259 EUB             :  */
    5260                 : static int
    5261 GNC    10899862 : rlocator_comparator(const void *p1, const void *p2)
    5262                 : {
    5263        10899862 :     RelFileLocator n1 = *(const RelFileLocator *) p1;
    5264        10899862 :     RelFileLocator n2 = *(const RelFileLocator *) p2;
    5265                 : 
    5266        10899862 :     if (n1.relNumber < n2.relNumber)
    5267 GIC     9145773 :         return -1;
    5268 GNC     1754089 :     else if (n1.relNumber > n2.relNumber)
    5269 GIC      333597 :         return 1;
    5270                 : 
    5271 GNC     1420492 :     if (n1.dbOid < n2.dbOid)
    5272 CBC       37470 :         return -1;
    5273 GNC     1383022 :     else if (n1.dbOid > n2.dbOid)
    5274 CBC       45334 :         return 1;
    5275                 : 
    5276 GNC     1337688 :     if (n1.spcOid < n2.spcOid)
    5277 UIC           0 :         return -1;
    5278 GNC     1337688 :     else if (n1.spcOid > n2.spcOid)
    5279 UIC           0 :         return 1;
    5280                 :     else
    5281 GIC     1337688 :         return 0;
    5282                 : }
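/*
 * Editor's sketch, not part of bufmgr.c: rlocator_comparator() is meant for
 * sorting (and then binary-searching) arrays of RelFileLocator, in the
 * spirit of how the relation-drop paths in this file prepare their lookup
 * arrays.  The wrapper below is illustrative only.
 */
static void
sort_locators_example(RelFileLocator *locators, int n)
{
    /* sort so that bsearch() with the same comparator can be used later */
    pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
}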
    5283                 : 
    5284                 : /*
    5285                 :  * Lock buffer header - set BM_LOCKED in buffer state.
    5286                 :  */
    5287                 : uint32
    5288        51777935 : LockBufHdr(BufferDesc *desc)
    5289 ECB             : {
    5290                 :     SpinDelayStatus delayStatus;
    5291                 :     uint32      old_buf_state;
    5292                 : 
    5293 GNC    51777935 :     Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
    5294                 : 
    5295 CBC    51777935 :     init_local_spin_delay(&delayStatus);
    5296 EUB             : 
    5297                 :     while (true)
    5298                 :     {
    5299                 :         /* set BM_LOCKED flag */
    5300 GIC    51792397 :         old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
    5301 ECB             :         /* if it wasn't set before we're OK */
    5302 GBC    51792397 :         if (!(old_buf_state & BM_LOCKED))
    5303 GIC    51777935 :             break;
    5304           14462 :         perform_spin_delay(&delayStatus);
    5305 ECB             :     }
    5306 GIC    51777935 :     finish_spin_delay(&delayStatus);
    5307        51777935 :     return old_buf_state | BM_LOCKED;
    5308                 : }
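/*
 * Editor's sketch, not part of bufmgr.c: the standard header-lock pattern
 * used throughout this file -- LockBufHdr() sets BM_LOCKED and returns the
 * state word, the caller inspects or adjusts it, and UnlockBufHdr() stores
 * the updated state with BM_LOCKED cleared.  Illustrative only.
 */
static bool
buffer_is_dirty_example(BufferDesc *bufHdr)
{
    uint32      buf_state = LockBufHdr(bufHdr);
    bool        dirty = (buf_state & BM_DIRTY) != 0;

    UnlockBufHdr(bufHdr, buf_state);
    return dirty;
}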
    5309                 : 
    5310                 : /*
    5311                 :  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
    5312                 :  * state at that point.
    5313                 :  *
    5314                 :  * Obviously the buffer could be locked by the time the value is returned, so
    5315                 :  * this is primarily useful in CAS style loops.
    5316                 :  */
    5317                 : static uint32
    5318            1033 : WaitBufHdrUnlocked(BufferDesc *buf)
    5319                 : {
    5320                 :     SpinDelayStatus delayStatus;
    5321                 :     uint32      buf_state;
    5322                 : 
    5323            1033 :     init_local_spin_delay(&delayStatus);
    5324 ECB             : 
    5325 GIC        1033 :     buf_state = pg_atomic_read_u32(&buf->state);
    5326                 : 
    5327 CBC        7839 :     while (buf_state & BM_LOCKED)
    5328 ECB             :     {
    5329 CBC        6806 :         perform_spin_delay(&delayStatus);
    5330 GIC        6806 :         buf_state = pg_atomic_read_u32(&buf->state);
    5331 ECB             :     }
    5332                 : 
    5333 GIC        1033 :     finish_spin_delay(&delayStatus);
    5334 ECB             : 
    5335 GIC        1033 :     return buf_state;
    5336                 : }
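/*
 * Editor's sketch, not part of bufmgr.c: the CAS-style loop the comment
 * above refers to, modelled on how pin/unpin code updates the state word
 * without taking the header lock.  The flag being set is illustrative only.
 */
static void
set_flag_cas_example(BufferDesc *buf, uint32 flag)
{
    uint32      old_buf_state = pg_atomic_read_u32(&buf->state);

    for (;;)
    {
        uint32      new_buf_state;

        /* back off while someone else holds the header spinlock */
        if (old_buf_state & BM_LOCKED)
            old_buf_state = WaitBufHdrUnlocked(buf);

        new_buf_state = old_buf_state | flag;

        /* on failure, old_buf_state is refreshed and we retry */
        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           new_buf_state))
            break;
    }
}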
    5337 ECB             : 
    5338                 : /*
    5339                 :  * BufferTag comparator.
    5340                 :  */
    5341                 : static inline int
    5342 GIC     2012103 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
    5343 ECB             : {
    5344                 :     int         ret;
    5345                 :     RelFileLocator rlocatora;
    5346                 :     RelFileLocator rlocatorb;
    5347                 : 
    5348 GNC     2012103 :     rlocatora = BufTagGetRelFileLocator(ba);
    5349         2012103 :     rlocatorb = BufTagGetRelFileLocator(bb);
    5350                 : 
    5351         2012103 :     ret = rlocator_comparator(&rlocatora, &rlocatorb);
    5352 ECB             : 
    5353 CBC     2012103 :     if (ret != 0)
    5354 GIC      676080 :         return ret;
    5355 ECB             : 
    5356 GNC     1336023 :     if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
    5357 GIC      104693 :         return -1;
    5358 GNC     1231330 :     if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
    5359 CBC       65622 :         return 1;
    5360                 : 
    5361 GIC     1165708 :     if (ba->blockNum < bb->blockNum)
    5362          767765 :         return -1;
    5363          397943 :     if (ba->blockNum > bb->blockNum)
    5364          397318 :         return 1;
    5365                 : 
    5366 CBC         625 :     return 0;
    5367 ECB             : }
    5368                 : 
    5369                 : /*
    5370                 :  * Comparator determining the writeout order in a checkpoint.
    5371                 :  *
    5372                 :  * It is important that tablespaces are compared first, the logic balancing
     5373                 :  * It is important that tablespaces are compared first; the logic balancing
    5374                 :  */
    5375                 : static inline int
    5376 GIC     4878998 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
    5377 ECB             : {
    5378                 :     /* compare tablespace */
    5379 GIC     4878998 :     if (a->tsId < b->tsId)
    5380 CBC       17450 :         return -1;
    5381 GIC     4861548 :     else if (a->tsId > b->tsId)
    5382 GBC       71952 :         return 1;
    5383 EUB             :     /* compare relation */
    5384 GNC     4789596 :     if (a->relNumber < b->relNumber)
    5385 GIC     1323526 :         return -1;
    5386 GNC     3466070 :     else if (a->relNumber > b->relNumber)
    5387 CBC     1212862 :         return 1;
    5388 ECB             :     /* compare fork */
    5389 CBC     2253208 :     else if (a->forkNum < b->forkNum)
    5390          115236 :         return -1;
    5391 GIC     2137972 :     else if (a->forkNum > b->forkNum)
    5392          112381 :         return 1;
    5393 ECB             :     /* compare block number */
    5394 GIC     2025591 :     else if (a->blockNum < b->blockNum)
    5395 CBC     1018511 :         return -1;
    5396 GIC     1007080 :     else if (a->blockNum > b->blockNum)
    5397          975421 :         return 1;
    5398 ECB             :     /* equal page IDs are unlikely, but not impossible */
    5399 CBC       31659 :     return 0;
    5400                 : }
    5401                 : 
    5402                 : /*
    5403                 :  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
    5404                 :  * progress.
    5405                 :  */
    5406                 : static int
    5407 GIC      426318 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
    5408                 : {
    5409          426318 :     CkptTsStatus *sa = (CkptTsStatus *) a;
    5410 CBC      426318 :     CkptTsStatus *sb = (CkptTsStatus *) b;
    5411                 : 
    5412 ECB             :     /* we want a min-heap, so return 1 for the a < b */
     5413                 :     /* we want a min-heap, so return 1 when a < b */
    5414 CBC      398630 :         return 1;
    5415 GIC       27688 :     else if (sa->progress == sb->progress)
    5416            1458 :         return 0;
    5417 ECB             :     else
    5418 GIC       26230 :         return -1;
    5419 ECB             : }
    5420                 : 
    5421                 : /*
    5422                 :  * Initialize a writeback context, discarding potential previous state.
    5423                 :  *
    5424                 :  * *max_pending is a pointer instead of an immediate value, so the coalesce
    5425                 :  * limits can easily changed by the GUC mechanism, and so calling code does
     5426                 :  * limits can easily be changed by the GUC mechanism, and so calling code does
    5427                 :  * writeback control will be performed.
    5428                 :  */
    5429                 : void
    5430 GIC        3777 : WritebackContextInit(WritebackContext *context, int *max_pending)
    5431 ECB             : {
    5432 GIC        3777 :     Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    5433 ECB             : 
    5434 GIC        3777 :     context->max_pending = max_pending;
    5435 CBC        3777 :     context->nr_pending = 0;
    5436 GIC        3777 : }
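/*
 * Editor's sketch, not part of bufmgr.c: how a flushing loop is expected to
 * drive the writeback machinery in this file.  BufferSync() does the
 * equivalent, using the checkpoint_flush_after GUC as the limit; the loop
 * body and function name here are illustrative only.
 */
static void
writeback_usage_example(BufferDesc **bufs_to_flush, int nbufs)
{
    WritebackContext wb_context;

    WritebackContextInit(&wb_context, &checkpoint_flush_after);

    for (int i = 0; i < nbufs; i++)
    {
        /* ... the buffer would be written out (FlushBuffer()) here ... */
        ScheduleBufferTagForWriteback(&wb_context, &bufs_to_flush[i]->tag);
    }

    /* push out anything still queued below the flush-after threshold */
    IssuePendingWritebacks(&wb_context);
}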
    5437                 : 
    5438 EUB             : /*
    5439                 :  * Add buffer to list of pending writeback requests.
    5440                 :  */
    5441                 : void
    5442 GIC      722286 : ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
    5443                 : {
    5444                 :     PendingWriteback *pending;
    5445                 : 
    5446 GNC      722286 :     if (io_direct_flags & IO_DIRECT_DATA)
    5447             534 :         return;
    5448                 : 
    5449                 :     /*
    5450                 :      * Add buffer to the pending writeback array, unless writeback control is
    5451 ECB             :      * disabled.
    5452                 :      */
    5453 CBC      721752 :     if (*context->max_pending > 0)
    5454 ECB             :     {
    5455 CBC      488257 :         Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    5456                 : 
    5457          488257 :         pending = &context->pending_writebacks[context->nr_pending++];
    5458                 : 
    5459 GIC      488257 :         pending->tag = *tag;
    5460                 :     }
    5461                 : 
    5462                 :     /*
    5463                 :      * Perform pending flushes if the writeback limit is exceeded. This
    5464                 :      * includes the case where previously an item has been added, but control
    5465                 :      * is now disabled.
    5466                 :      */
    5467 CBC      721752 :     if (context->nr_pending >= *context->max_pending)
    5468 GIC      247995 :         IssuePendingWritebacks(context);
    5469 ECB             : }
    5470                 : 
    5471                 : #define ST_SORT sort_pending_writebacks
    5472                 : #define ST_ELEMENT_TYPE PendingWriteback
    5473                 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
    5474                 : #define ST_SCOPE static
    5475                 : #define ST_DEFINE
    5476                 : #include <lib/sort_template.h>
    5477                 : 
    5478                 : /*
    5479                 :  * Issue all pending writeback requests, previously scheduled with
    5480                 :  * ScheduleBufferTagForWriteback, to the OS.
    5481                 :  *
     5482                 :  * Because this is only used to improve the OS's I/O scheduling, we try never to
    5483 EUB             :  * error out - it's just a hint.
    5484                 :  */
    5485                 : void
    5486 GIC      249591 : IssuePendingWritebacks(WritebackContext *context)
    5487                 : {
    5488                 :     int         i;
    5489                 : 
    5490          249591 :     if (context->nr_pending == 0)
    5491          233806 :         return;
    5492                 : 
    5493 ECB             :     /*
    5494                 :      * Executing the writes in-order can make them a lot faster, and allows to
     5495                 :      * Executing the writes in-order can make them a lot faster, and allows us
     5496                 :      * to merge writeback requests for consecutive blocks into larger writebacks.
    5497 GIC       15785 :     sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
    5498                 : 
    5499 ECB             :     /*
    5500                 :      * Coalesce neighbouring writes, but nothing else. For that we iterate
     5501                 :      * through the now-sorted array of pending flushes, and look forward to
    5502                 :      * find all neighbouring (or identical) writes.
    5503                 :      */
    5504 GIC      150084 :     for (i = 0; i < context->nr_pending; i++)
    5505 ECB             :     {
    5506                 :         PendingWriteback *cur;
    5507                 :         PendingWriteback *next;
    5508                 :         SMgrRelation reln;
    5509                 :         int         ahead;
    5510                 :         BufferTag   tag;
    5511                 :         RelFileLocator currlocator;
    5512 GIC      134299 :         Size        nblocks = 1;
    5513                 : 
    5514 CBC      134299 :         cur = &context->pending_writebacks[i];
    5515          134299 :         tag = cur->tag;
    5516 GNC      134299 :         currlocator = BufTagGetRelFileLocator(&tag);
    5517 ECB             : 
    5518                 :         /*
    5519                 :          * Peek ahead, into following writeback requests, to see if they can
    5520                 :          * be combined with the current one.
    5521                 :          */
    5522 CBC      485991 :         for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
    5523                 :         {
    5524                 : 
    5525          470206 :             next = &context->pending_writebacks[i + ahead + 1];
    5526 ECB             : 
    5527                 :             /* different file, stop */
    5528 GNC      470206 :             if (!RelFileLocatorEquals(currlocator,
    5529          394654 :                                       BufTagGetRelFileLocator(&next->tag)) ||
    5530          394654 :                 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
    5531 ECB             :                 break;
    5532                 : 
    5533                 :             /* ok, block queued twice, skip */
    5534 CBC      357539 :             if (cur->tag.blockNum == next->tag.blockNum)
    5535             550 :                 continue;
    5536                 : 
    5537                 :             /* only merge consecutive writes */
    5538 GIC      356989 :             if (cur->tag.blockNum + 1 != next->tag.blockNum)
    5539 CBC        5847 :                 break;
    5540 ECB             : 
    5541 CBC      351142 :             nblocks++;
    5542 GIC      351142 :             cur = next;
    5543                 :         }
    5544                 : 
    5545          134299 :         i += ahead;
    5546                 : 
    5547                 :         /* and finally tell the kernel to write the data to storage */
    5548 GNC      134299 :         reln = smgropen(currlocator, InvalidBackendId);
    5549          134299 :         smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
    5550                 :     }
    5551                 : 
    5552 GIC       15785 :     context->nr_pending = 0;
    5553 ECB             : }
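/*
 * Editor's note (illustrative worked example, not part of bufmgr.c): after
 * sorting, a pending list for one relation fork containing blocks
 * 10, 11, 12, 12, 14 is issued as two smgrwriteback() calls: one starting
 * at block 10 with nblocks = 3 (the duplicate 12 is skipped), and one for
 * block 14 with nblocks = 1.  Requests for a different relation or fork are
 * never merged, because the peek-ahead loop above stops at that boundary.
 */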
    5554                 : 
    5555                 : 
    5556                 : /*
    5557                 :  * Implement slower/larger portions of TestForOldSnapshot
    5558                 :  *
    5559                 :  * Smaller/faster portions are put inline, but the entire set of logic is too
    5560                 :  * big for that.
    5561                 :  */
    5562                 : void
    5563 GBC         651 : TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
    5564 EUB             : {
    5565 GIC         651 :     if (RelationAllowsEarlyPruning(relation)
    5566 GBC         651 :         && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
    5567 GIC           3 :         ereport(ERROR,
    5568                 :                 (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
    5569                 :                  errmsg("snapshot too old")));
    5570 CBC         648 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a