LCOV - differential code coverage report
Current view: top level - src/backend/access/transam - xlog.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB DUB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 88.4 % 2395 2116 28 63 167 21 51 1392 120 553 187 1428 20 72
Current Date: 2023-04-08 15:15:32 Functions: 99.1 % 116 115 1 106 9 112 1 3
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * xlog.c
       4                 :  *      PostgreSQL write-ahead log manager
       5                 :  *
       6                 :  * The Write-Ahead Log (WAL) functionality is split into several source
       7                 :  * files, in addition to this one:
       8                 :  *
       9                 :  * xloginsert.c - Functions for constructing WAL records
      10                 :  * xlogrecovery.c - WAL recovery and standby code
      11                 :  * xlogreader.c - Facility for reading WAL files and parsing WAL records
      12                 :  * xlogutils.c - Helper functions for WAL redo routines
      13                 :  *
      14                 :  * This file contains functions for coordinating database startup and
      15                 :  * checkpointing, and managing the write-ahead log buffers when the
      16                 :  * system is running.
      17                 :  *
      18                 :  * StartupXLOG() is the main entry point of the startup process.  It
      19                 :  * coordinates database startup, performing WAL recovery, and the
      20                 :  * transition from WAL recovery into normal operations.
      21                 :  *
      22                 :  * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
      23                 :  * callers should not call this directly, but use the functions in
      24                 :  * xloginsert.c to construct the WAL record.  XLogFlush() can be used
      25                 :  * to force the WAL to disk.
      26                 :  *
      27                 :  * In addition to those, there are many other functions for interrogating
      28                 :  * the current system state, and for starting/stopping backups.
      29                 :  *
      30                 :  *
      31                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
      32                 :  * Portions Copyright (c) 1994, Regents of the University of California
      33                 :  *
      34                 :  * src/backend/access/transam/xlog.c
      35                 :  *
      36                 :  *-------------------------------------------------------------------------
      37                 :  */
      38                 : 
      39                 : #include "postgres.h"
      40                 : 
      41                 : #include <ctype.h>
      42                 : #include <math.h>
      43                 : #include <time.h>
      44                 : #include <fcntl.h>
      45                 : #include <sys/stat.h>
      46                 : #include <sys/time.h>
      47                 : #include <unistd.h>
      48                 : 
      49                 : #include "access/clog.h"
      50                 : #include "access/commit_ts.h"
      51                 : #include "access/heaptoast.h"
      52                 : #include "access/multixact.h"
      53                 : #include "access/rewriteheap.h"
      54                 : #include "access/subtrans.h"
      55                 : #include "access/timeline.h"
      56                 : #include "access/transam.h"
      57                 : #include "access/twophase.h"
      58                 : #include "access/xact.h"
      59                 : #include "access/xlog_internal.h"
      60                 : #include "access/xlogarchive.h"
      61                 : #include "access/xloginsert.h"
      62                 : #include "access/xlogprefetcher.h"
      63                 : #include "access/xlogreader.h"
      64                 : #include "access/xlogrecovery.h"
      65                 : #include "access/xlogutils.h"
      66                 : #include "backup/basebackup.h"
      67                 : #include "catalog/catversion.h"
      68                 : #include "catalog/pg_control.h"
      69                 : #include "catalog/pg_database.h"
      70                 : #include "common/controldata_utils.h"
      71                 : #include "common/file_utils.h"
      72                 : #include "executor/instrument.h"
      73                 : #include "miscadmin.h"
      74                 : #include "pg_trace.h"
      75                 : #include "pgstat.h"
      76                 : #include "port/atomics.h"
      77                 : #include "port/pg_iovec.h"
      78                 : #include "postmaster/bgwriter.h"
      79                 : #include "postmaster/startup.h"
      80                 : #include "postmaster/walwriter.h"
      81                 : #include "replication/logical.h"
      82                 : #include "replication/origin.h"
      83                 : #include "replication/slot.h"
      84                 : #include "replication/snapbuild.h"
      85                 : #include "replication/walreceiver.h"
      86                 : #include "replication/walsender.h"
      87                 : #include "storage/bufmgr.h"
      88                 : #include "storage/fd.h"
      89                 : #include "storage/ipc.h"
      90                 : #include "storage/large_object.h"
      91                 : #include "storage/latch.h"
      92                 : #include "storage/pmsignal.h"
      93                 : #include "storage/predicate.h"
      94                 : #include "storage/proc.h"
      95                 : #include "storage/procarray.h"
      96                 : #include "storage/reinit.h"
      97                 : #include "storage/smgr.h"
      98                 : #include "storage/spin.h"
      99                 : #include "storage/sync.h"
     100                 : #include "utils/guc_hooks.h"
     101                 : #include "utils/guc_tables.h"
     102                 : #include "utils/memutils.h"
     103                 : #include "utils/ps_status.h"
     104                 : #include "utils/relmapper.h"
     105                 : #include "utils/pg_rusage.h"
     106                 : #include "utils/snapmgr.h"
     107                 : #include "utils/timeout.h"
     108                 : #include "utils/timestamp.h"
     109                 : #include "utils/varlena.h"
     110                 : 
     111                 : extern uint32 bootstrap_data_checksum_version;
     112                 : 
     113                 : /* timeline ID to be used when bootstrapping */
     114                 : #define BootstrapTimeLineID     1
     115                 : 
     116                 : /* User-settable parameters */
     117                 : int         max_wal_size_mb = 1024; /* 1 GB */
     118                 : int         min_wal_size_mb = 80;   /* 80 MB */
     119                 : int         wal_keep_size_mb = 0;
     120                 : int         XLOGbuffers = -1;
     121                 : int         XLogArchiveTimeout = 0;
     122                 : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
     123                 : char       *XLogArchiveCommand = NULL;
     124                 : bool        EnableHotStandby = false;
     125                 : bool        fullPageWrites = true;
     126                 : bool        wal_log_hints = false;
     127                 : int         wal_compression = WAL_COMPRESSION_NONE;
     128                 : char       *wal_consistency_checking_string = NULL;
     129                 : bool       *wal_consistency_checking = NULL;
     130                 : bool        wal_init_zero = true;
     131                 : bool        wal_recycle = true;
     132                 : bool        log_checkpoints = true;
     133                 : int         sync_method = DEFAULT_SYNC_METHOD;
     134                 : int         wal_level = WAL_LEVEL_REPLICA;
     135                 : int         CommitDelay = 0;    /* precommit delay in microseconds */
     136                 : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
     137                 : int         wal_retrieve_retry_interval = 5000;
     138                 : int         max_slot_wal_keep_size_mb = -1;
     139                 : int         wal_decode_buffer_size = 512 * 1024;
     140                 : bool        track_wal_io_timing = false;
     141                 : 
     142                 : #ifdef WAL_DEBUG
     143                 : bool        XLOG_DEBUG = false;
     144                 : #endif
     145                 : 
     146                 : int         wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
     147                 : 
     148                 : /*
     149                 :  * Number of WAL insertion locks to use. A higher value allows more insertions
     150                 :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
     151                 :  * which needs to iterate all the locks.
     152                 :  */
     153                 : #define NUM_XLOGINSERT_LOCKS  8
     154                 : 
     155                 : /*
     156                 :  * Max distance from last checkpoint, before triggering a new xlog-based
     157                 :  * checkpoint.
     158                 :  */
     159                 : int         CheckPointSegments;
     160                 : 
     161                 : /* Estimated distance between checkpoints, in bytes */
     162                 : static double CheckPointDistanceEstimate = 0;
     163                 : static double PrevCheckPointDistance = 0;
     164                 : 
     165                 : /*
     166                 :  * Track whether there were any deferred checks for custom resource managers
     167                 :  * specified in wal_consistency_checking.
     168                 :  */
     169                 : static bool check_wal_consistency_checking_deferred = false;
     170                 : 
     171                 : /*
     172                 :  * GUC support
     173                 :  */
     174                 : const struct config_enum_entry sync_method_options[] = {
     175                 :     {"fsync", SYNC_METHOD_FSYNC, false},
     176                 : #ifdef HAVE_FSYNC_WRITETHROUGH
     177                 :     {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
     178                 : #endif
     179                 :     {"fdatasync", SYNC_METHOD_FDATASYNC, false},
     180                 : #ifdef O_SYNC
     181                 :     {"open_sync", SYNC_METHOD_OPEN, false},
     182                 : #endif
     183                 : #ifdef O_DSYNC
     184                 :     {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
     185                 : #endif
     186                 :     {NULL, 0, false}
     187                 : };
     188                 : 
     189                 : 
     190                 : /*
     191                 :  * Although only "on", "off", and "always" are documented,
     192                 :  * we accept all the likely variants of "on" and "off".
     193                 :  */
     194                 : const struct config_enum_entry archive_mode_options[] = {
     195                 :     {"always", ARCHIVE_MODE_ALWAYS, false},
     196                 :     {"on", ARCHIVE_MODE_ON, false},
     197                 :     {"off", ARCHIVE_MODE_OFF, false},
     198                 :     {"true", ARCHIVE_MODE_ON, true},
     199                 :     {"false", ARCHIVE_MODE_OFF, true},
     200                 :     {"yes", ARCHIVE_MODE_ON, true},
     201                 :     {"no", ARCHIVE_MODE_OFF, true},
     202                 :     {"1", ARCHIVE_MODE_ON, true},
     203                 :     {"0", ARCHIVE_MODE_OFF, true},
     204                 :     {NULL, 0, false}
     205                 : };
     206                 : 
     207                 : /*
     208                 :  * Statistics for current checkpoint are collected in this global struct.
     209                 :  * Because only the checkpointer or a stand-alone backend can perform
     210                 :  * checkpoints, this will be unused in normal backends.
     211                 :  */
     212                 : CheckpointStatsData CheckpointStats;
     213                 : 
     214                 : /*
     215                 :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
     216                 :  * the replayed WAL records indicate. It's initialized with full_page_writes
     217                 :  * that the recovery starting checkpoint record indicates, and then updated
     218                 :  * each time XLOG_FPW_CHANGE record is replayed.
     219                 :  */
     220                 : static bool lastFullPageWrites;
     221                 : 
     222                 : /*
     223                 :  * Local copy of the state tracked by SharedRecoveryState in shared memory.
     224                 :  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
     225                 :  * means "not known, need to check the shared state".
     226                 :  */
     227                 : static bool LocalRecoveryInProgress = true;
     228                 : 
     229                 : /*
     230                 :  * Local state for XLogInsertAllowed():
     231                 :  *      1: unconditionally allowed to insert XLOG
     232                 :  *      0: unconditionally not allowed to insert XLOG
     233                 :  *      -1: must check RecoveryInProgress(); disallow until it is false
     234                 :  * Most processes start with -1 and transition to 1 after seeing that recovery
     235                 :  * is not in progress.  But we can also force the value for special cases.
     236                 :  * The coding in XLogInsertAllowed() depends on the first two of these states
     237                 :  * being numerically the same as bool true and false.
     238                 :  */
     239                 : static int  LocalXLogInsertAllowed = -1;
     240                 : 
     241                 : /*
     242                 :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
     243                 :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
     244                 :  * end+1 of the last record, and is reset when we end a top-level transaction,
     245                 :  * or start a new one; so it can be used to tell if the current transaction has
     246                 :  * created any XLOG records.
     247                 :  *
     248                 :  * While in parallel mode, this may not be fully up to date.  When committing,
     249                 :  * a transaction can assume this covers all xlog records written either by the
     250                 :  * user backend or by any parallel worker which was present at any point during
     251                 :  * the transaction.  But when aborting, or when still in parallel mode, other
     252                 :  * parallel backends may have written WAL records at later LSNs than the value
     253                 :  * stored here.  The parallel leader advances its own copy, when necessary,
     254                 :  * in WaitForParallelWorkersToFinish.
     255                 :  */
     256                 : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
     257                 : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
     258                 : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
     259                 : 
     260                 : /*
     261                 :  * RedoRecPtr is this backend's local copy of the REDO record pointer
     262                 :  * (which is almost but not quite the same as a pointer to the most recent
     263                 :  * CHECKPOINT record).  We update this from the shared-memory copy,
     264                 :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
     265                 :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
     266                 :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
     267                 :  * see GetRedoRecPtr.
     268                 :  *
     269                 :  * NB: Code that uses this variable must be prepared not only for the
     270                 :  * possibility that it may be arbitrarily out of date, but also for the
     271                 :  * possibility that it might be set to InvalidXLogRecPtr. We used to
     272                 :  * initialize it as a side effect of the first call to RecoveryInProgress(),
     273                 :  * which meant that most code that might use it could assume that it had a
     274                 :  * real if perhaps stale value. That's no longer the case.
     275                 :  */
     276                 : static XLogRecPtr RedoRecPtr;
     277                 : 
     278                 : /*
     279                 :  * doPageWrites is this backend's local copy of (fullPageWrites ||
     280                 :  * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
     281                 :  * a full-page image of a page needs to be taken.
     282                 :  *
     283                 :  * NB: Initially this is false, and there's no guarantee that it will be
     284                 :  * initialized to any other value before it is first used. Any code that
     285                 :  * makes use of it must recheck the value after obtaining a WALInsertLock,
     286                 :  * and respond appropriately if it turns out that the previous value wasn't
     287                 :  * accurate.
     288                 :  */
     289                 : static bool doPageWrites;
     290                 : 
     291                 : /*----------
     292                 :  * Shared-memory data structures for XLOG control
     293                 :  *
     294                 :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
     295                 :  * the log up to (all records before that point must be written or fsynced).
     296                 :  * LogwrtResult indicates the byte positions we have already written/fsynced.
     297                 :  * These structs are identical but are declared separately to indicate their
     298                 :  * slightly different functions.
     299                 :  *
     300                 :  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
     301                 :  * WALWriteLock.  To update it, you need to hold both locks.  The point of
     302                 :  * this arrangement is that the value can be examined by code that already
     303                 :  * holds WALWriteLock without needing to grab info_lck as well.  In addition
     304                 :  * to the shared variable, each backend has a private copy of LogwrtResult,
     305                 :  * which is updated when convenient.
     306                 :  *
     307                 :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
     308                 :  * (protected by info_lck), but we don't need to cache any copies of it.
     309                 :  *
     310                 :  * info_lck is only held long enough to read/update the protected variables,
     311                 :  * so it's a plain spinlock.  The other locks are held longer (potentially
     312                 :  * over I/O operations), so we use LWLocks for them.  These locks are:
     313                 :  *
     314                 :  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
     315                 :  * It is only held while initializing and changing the mapping.  If the
     316                 :  * contents of the buffer being replaced haven't been written yet, the mapping
     317                 :  * lock is released while the write is done, and reacquired afterwards.
     318                 :  *
     319                 :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
     320                 :  * XLogFlush).
     321                 :  *
     322                 :  * ControlFileLock: must be held to read/update control file or create
     323                 :  * new log file.
     324                 :  *
     325                 :  *----------
     326                 :  */
     327                 : 
     328                 : typedef struct XLogwrtRqst
     329                 : {
     330                 :     XLogRecPtr  Write;          /* last byte + 1 to write out */
     331                 :     XLogRecPtr  Flush;          /* last byte + 1 to flush */
     332                 : } XLogwrtRqst;
     333                 : 
     334                 : typedef struct XLogwrtResult
     335                 : {
     336                 :     XLogRecPtr  Write;          /* last byte + 1 written out */
     337                 :     XLogRecPtr  Flush;          /* last byte + 1 flushed */
     338                 : } XLogwrtResult;
     339                 : 
     340                 : /*
     341                 :  * Inserting to WAL is protected by a small fixed number of WAL insertion
     342                 :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
     343                 :  * matter which one. To lock out other concurrent insertions, you must hold
     344                 :  * all of them. Each WAL insertion lock consists of a lightweight lock, plus
     345                 :  * an indicator of how far the insertion has progressed (insertingAt).
     346                 :  *
     347                 :  * The insertingAt values are read when a process wants to flush WAL from
     348                 :  * the in-memory buffers to disk, to check that all the insertions to the
     349                 :  * region the process is about to write out have finished. You could simply
     350                 :  * wait for all currently in-progress insertions to finish, but the
     351                 :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
     352                 :  * so that you only wait for the insertions that are modifying the buffers
     353                 :  * you're about to write out.
     354                 :  *
     355                 :  * This isn't just an optimization. If all the WAL buffers are dirty, an
     356                 :  * inserter that's holding a WAL insert lock might need to evict an old WAL
     357                 :  * buffer, which requires flushing the WAL. If it's possible for an inserter
     358                 :  * to block on another inserter unnecessarily, deadlock can arise when two
     359                 :  * inserters holding a WAL insert lock wait for each other to finish their
     360                 :  * insertion.
     361                 :  *
     362                 :  * Small WAL records that don't cross a page boundary never update the value,
     363                 :  * the WAL record is just copied to the page and the lock is released. But
     364                 :  * to avoid the deadlock-scenario explained above, the indicator is always
     365                 :  * updated before sleeping while holding an insertion lock.
     366                 :  *
     367                 :  * lastImportantAt contains the LSN of the last important WAL record inserted
     368                 :  * using a given lock. This value is used to detect if there has been
     369                 :  * important WAL activity since the last time some action, like a checkpoint,
     370                 :  * was performed, so the action can be skipped if there was none. The LSN is
     371                 :  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
     372                 :  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
     373                 :  * records.  Tracking the WAL activity directly in WALInsertLock has the
     374                 :  * advantage of not needing any additional locks to update the value.
     375                 :  */
     376                 : typedef struct
     377                 : {
     378                 :     LWLock      lock;
     379                 :     XLogRecPtr  insertingAt;
     380                 :     XLogRecPtr  lastImportantAt;
     381                 : } WALInsertLock;
     382                 : 
     383                 : /*
     384                 :  * All the WAL insertion locks are allocated as an array in shared memory. We
     385                 :  * force the array stride to be a power of 2, which saves a few cycles in
     386                 :  * indexing, but more importantly also ensures that individual slots don't
     387                 :  * cross cache line boundaries. (Of course, we have to also ensure that the
     388                 :  * array start address is suitably aligned.)
     389                 :  */
     390                 : typedef union WALInsertLockPadded
     391                 : {
     392                 :     WALInsertLock l;
     393                 :     char        pad[PG_CACHE_LINE_SIZE];
     394                 : } WALInsertLockPadded;
     395                 : 
     396                 : /*
     397                 :  * Session status of running backup, used for sanity checks in SQL-callable
     398                 :  * functions to start and stop backups.
     399                 :  */
     400                 : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
     401                 : 
     402                 : /*
     403                 :  * Shared state data for WAL insertion.
     404                 :  */
     405                 : typedef struct XLogCtlInsert
     406                 : {
     407                 :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
     408                 : 
     409                 :     /*
     410                 :      * CurrBytePos is the end of reserved WAL. The next record will be
     411                 :      * inserted at that position. PrevBytePos is the start position of the
     412                 :      * previously inserted (or rather, reserved) record - it is copied to the
     413                 :      * prev-link of the next record. These are stored as "usable byte
     414                 :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     415                 :      */
     416                 :     uint64      CurrBytePos;
     417                 :     uint64      PrevBytePos;
     418                 : 
     419                 :     /*
     420                 :      * Make sure the above heavily-contended spinlock and byte positions are
     421                 :      * on their own cache line. In particular, the RedoRecPtr and full page
     422                 :      * write variables below should be on a different cache line. They are
     423                 :      * read on every WAL insertion, but updated rarely, and we don't want
     424                 :      * those reads to steal the cache line containing Curr/PrevBytePos.
     425                 :      */
     426                 :     char        pad[PG_CACHE_LINE_SIZE];
     427                 : 
     428                 :     /*
     429                 :      * fullPageWrites is the authoritative value used by all backends to
     430                 :      * determine whether to write full-page image to WAL. This shared value,
     431                 :      * instead of the process-local fullPageWrites, is required because, when
     432                 :      * full_page_writes is changed by SIGHUP, we must WAL-log it before it
     433                 :      * actually affects WAL-logging by backends.  The checkpointer sets it at
     434                 :      * startup or after SIGHUP.
     435                 :      *
     436                 :      * To read these fields, you must hold an insertion lock. To modify them,
     437                 :      * you must hold ALL the locks.
     438                 :      */
     439                 :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
     440                 :     bool        fullPageWrites;
     441                 : 
     442                 :     /*
     443                 :      * runningBackups is a counter indicating the number of backups currently
     444                 :      * in progress. lastBackupStart is the latest checkpoint redo location
     445                 :      * used as a starting point for an online backup.
     446                 :      */
     447                 :     int         runningBackups;
     448                 :     XLogRecPtr  lastBackupStart;
     449                 : 
     450                 :     /*
     451                 :      * WAL insertion locks.
     452                 :      */
     453                 :     WALInsertLockPadded *WALInsertLocks;
     454                 : } XLogCtlInsert;
     455                 : 
     456                 : /*
     457                 :  * Total shared-memory state for XLOG; this file accesses it through XLogCtl.
     458                 :  */
     459                 : typedef struct XLogCtlData
     460                 : {
     461                 :     XLogCtlInsert Insert;       /* WAL insertion state */
     462                 : 
     463                 :     /* Protected by info_lck: */
     464                 :     XLogwrtRqst LogwrtRqst;     /* requested WAL write position(s) */
     465                 :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
     466                 :     FullTransactionId ckptFullXid;  /* nextXid of latest checkpoint */
     467                 :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
     468                 :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
     469                 : 
     470                 :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
     471                 : 
     472                 :     /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
     473                 :     XLogRecPtr  unloggedLSN;
     474                 :     slock_t     ulsn_lck;       /* protects unloggedLSN */
     475                 : 
     476                 :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
     477                 :     pg_time_t   lastSegSwitchTime;
     478                 :     XLogRecPtr  lastSegSwitchLSN;
     479                 : 
     480                 :     /*
     481                 :      * Protected by info_lck and WALWriteLock (you must hold either lock to
     482                 :      * read it, but both to update)
     483                 :      */
     484                 :     XLogwrtResult LogwrtResult;
     485                 : 
     486                 :     /*
     487                 :      * Latest initialized page in the cache (last byte position + 1).
     488                 :      *
     489                 :      * To change the identity of a buffer (and InitializedUpTo), you need to
     490                 :      * hold WALBufMappingLock.  To change the identity of a buffer that's
     491                 :      * still dirty, the old page needs to be written out first, and for that
     492                 :      * you need WALWriteLock, and you need to ensure that there are no
     493                 :      * in-progress insertions to the page by calling
     494                 :      * WaitXLogInsertionsToFinish().
     495                 :      */
     496                 :     XLogRecPtr  InitializedUpTo;
     497                 : 
     498                 :     /*
     499                 :      * These values do not change after startup, although the pointed-to pages
     500                 :      * and xlblocks values certainly do.  xlblocks values are protected by
     501                 :      * WALBufMappingLock.
     502                 :      */
     503                 :     char       *pages;          /* buffers for unwritten XLOG pages */
     504                 :     XLogRecPtr *xlblocks;       /* 1st byte ptr-s + XLOG_BLCKSZ */
     505                 :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
     506                 : 
     507                 :     /*
     508                 :      * InsertTimeLineID is the timeline into which new WAL is being inserted
     509                 :      * and flushed. It is zero during recovery, and does not change once set.
     510                 :      *
     511                 :      * If we create a new timeline when the system was started up,
     512                 :      * PrevTimeLineID is the old timeline's ID that we forked off from.
     513                 :      * Otherwise it's equal to InsertTimeLineID.
     514                 :      */
     515                 :     TimeLineID  InsertTimeLineID;
     516                 :     TimeLineID  PrevTimeLineID;
     517                 : 
     518                 :     /*
     519                 :      * SharedRecoveryState indicates if we're still in crash or archive
     520                 :      * recovery.  Protected by info_lck.
     521                 :      */
     522                 :     RecoveryState SharedRecoveryState;
     523                 : 
     524                 :     /*
     525                 :      * InstallXLogFileSegmentActive indicates whether the checkpointer should
     526                 :      * arrange for future segments by recycling and/or PreallocXlogFiles().
     527                 :      * Protected by ControlFileLock.  Only the startup process changes it.  If
     528                 :      * true, anyone can use InstallXLogFileSegment().  If false, the startup
     529                 :      * process owns the exclusive right to install segments, by reading from
     530                 :      * the archive and possibly replacing existing files.
     531                 :      */
     532                 :     bool        InstallXLogFileSegmentActive;
     533                 : 
     534                 :     /*
     535                 :      * WalWriterSleeping indicates whether the WAL writer is currently in
     536                 :      * low-power mode (and hence should be nudged if an async commit occurs).
     537                 :      * Protected by info_lck.
     538                 :      */
     539                 :     bool        WalWriterSleeping;
     540                 : 
     541                 :     /*
     542                 :      * During recovery, we keep a copy of the latest checkpoint record here.
     543                 :      * lastCheckPointRecPtr points to start of checkpoint record and
     544                 :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
     545                 :      * checkpointer when it wants to create a restartpoint.
     546                 :      *
     547                 :      * Protected by info_lck.
     548                 :      */
     549                 :     XLogRecPtr  lastCheckPointRecPtr;
     550                 :     XLogRecPtr  lastCheckPointEndPtr;
     551                 :     CheckPoint  lastCheckPoint;
     552                 : 
     553                 :     /*
     554                 :      * lastFpwDisableRecPtr points to the start of the last replayed
     555                 :      * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
     556                 :      */
     557                 :     XLogRecPtr  lastFpwDisableRecPtr;
     558                 : 
     559                 :     slock_t     info_lck;       /* locks shared variables shown above */
     560                 : } XLogCtlData;
     561                 : 
     562                 : static XLogCtlData *XLogCtl = NULL; /* pointer to the shared-memory XLOG state; initially NULL */
     563                 : 
     564                 : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
     565                 : static WALInsertLockPadded *WALInsertLocks = NULL;
     566                 : 
     567                 : /*
     568                 :  * We maintain an image of pg_control in shared memory.
     569                 :  */
     570                 : static ControlFileData *ControlFile = NULL;
     571                 : 
     572                 : /*
     573                 :  * Calculate the amount of space left on the page after 'endptr' (0 when
     574                 :  * 'endptr' is exactly at a page boundary).  Beware multiple evaluation!
     575                 :  */
     576                 : #define INSERT_FREESPACE(endptr)    \
     577                 :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
     578                 : 
     579                 : /* Advance to the next buffer index, wrapping to 0 after XLogCacheBlck. */
     580                 : #define NextBufIdx(idx)     \
     581                 :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
     582                 : 
     583                 : /*
     584                 :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
     585                 :  * would hold if it were in cache, the page containing 'recptr'.
     586                 :  */
     587                 : #define XLogRecPtrToBufIdx(recptr)  \
     588                 :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
     589                 : 
     590                 : /*
     591                 :  * Number of bytes in a WAL page usable for WAL data (page size minus a short page header).
     592                 :  */
     593                 : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
     594                 : 
     595                 : /*
     596                 :  * Convert values of GUCs measured in megabytes to the equivalent segment count.
     597                 :  * Rounds down.
     598                 :  */
     599                 : #define ConvertToXSegs(x, segsize)  XLogMBVarToSegs((x), (segsize))
     600                 : 
     601                 : /* The number of bytes in a WAL segment usable for WAL data. */
     602                 : static int  UsableBytesInSegment;
     603                 : 
     604                 : /*
     605                 :  * Private, possibly out-of-date copy of shared LogwrtResult (XLogCtl->LogwrtResult).
     606                 :  * See discussion above.
     607                 :  */
     608                 : static XLogwrtResult LogwrtResult = {0, 0};
     609                 : 
     610                 : /*
     611                 :  * openLogFile is -1 or a kernel FD for an open log file segment.
     612                 :  * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
     613                 :  * These variables are only used to write the XLOG, and so will normally refer
     614                 :  * to the active segment.
     615                 :  *
     616                 :  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
     617                 :  */
     618                 : static int  openLogFile = -1;
     619                 : static XLogSegNo openLogSegNo = 0;
     620                 : static TimeLineID openLogTLI = 0;
     621                 : 
     622                 : /*
     623                 :  * Local copies of equivalent fields in the control file.  When running
     624                 :  * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
     625                 :  * expect to replay all the WAL available, and updateMinRecoveryPoint is
     626                 :  * switched to false to prevent any updates while replaying records.
     627                 :  * Those values are kept consistent as long as crash recovery runs.
     628                 :  */
     629                 : static XLogRecPtr LocalMinRecoveryPoint;
     630                 : static TimeLineID LocalMinRecoveryPointTLI;
     631                 : static bool updateMinRecoveryPoint = true;
     632                 : 
     633                 : /* For WALInsertLockAcquire/Release functions */
     634                 : static int  MyLockNo = 0;   /* index into WALInsertLocks[] used when not holding all locks */
     635                 : static bool holdingAllLocks = false;    /* true while holding all insertion locks */
     636                 : 
     637                 : #ifdef WAL_DEBUG
     638                 : static MemoryContext walDebugCxt = NULL;    /* context switched to while building WAL_DEBUG output */
     639                 : #endif
     640                 : 
     641                 : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
     642                 :                                         XLogRecPtr EndOfLog,
     643                 :                                         TimeLineID newTLI);
     644                 : static void CheckRequiredParameterValues(void);
     645                 : static void XLogReportParameters(void);
     646                 : static int  LocalSetXLogInsertAllowed(void);
     647                 : static void CreateEndOfRecoveryRecord(void);
     648                 : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
     649                 :                                                   XLogRecPtr pagePtr,
     650                 :                                                   TimeLineID newTLI);
     651                 : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
     652                 : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
     653                 : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
     654                 : 
     655                 : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
     656                 :                                   bool opportunistic);
     657                 : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
     658                 : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
     659                 :                                    bool find_free, XLogSegNo max_segno,
     660                 :                                    TimeLineID tli);
     661                 : static void XLogFileClose(void);
     662                 : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
     663                 : static void RemoveTempXlogFiles(void);
     664                 : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
     665                 :                                XLogRecPtr endptr, TimeLineID insertTLI);
     666                 : static void RemoveXlogFile(const struct dirent *segment_de,
     667                 :                            XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
     668                 :                            TimeLineID insertTLI);
     669                 : static void UpdateLastRemovedPtr(char *filename);
     670                 : static void ValidateXLOGDirectoryStructure(void);
     671                 : static void CleanupBackupHistory(void);
     672                 : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
     673                 : static bool PerformRecoveryXLogAction(void);
     674                 : static void InitControlFile(uint64 sysidentifier);
     675                 : static void WriteControlFile(void);
     676                 : static void ReadControlFile(void);
     677                 : static void UpdateControlFile(void);
     678                 : static char *str_time(pg_time_t tnow);
     679                 : 
     680                 : static int  get_sync_bit(int method);
     681                 : 
     682                 : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
     683                 :                                 XLogRecData *rdata,
     684                 :                                 XLogRecPtr StartPos, XLogRecPtr EndPos,
     685                 :                                 TimeLineID tli);
     686                 : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
     687                 :                                       XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
     688                 : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
     689                 :                               XLogRecPtr *PrevPtr);
     690                 : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
     691                 : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
     692                 : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
     693                 : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
     694                 : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
     695                 : 
     696                 : static void WALInsertLockAcquire(void);
     697                 : static void WALInsertLockAcquireExclusive(void);
     698                 : static void WALInsertLockRelease(void);
     699                 : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
     700                 : 
     701                 : /*
     702                 :  * Insert an XLOG record represented by an already-constructed chain of data
     703                 :  * chunks.  This is a low-level routine; to construct the WAL record header
     704                 :  * and data, use the higher-level routines in xloginsert.c.
     705                 :  *
     706                 :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
     707                 :  * WAL record applies to, that were not included in the record as full page
     708                 :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
     709                 :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
     710                 :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
     711                 :  * record is always inserted.
     712                 :  *
     713                 :  * 'flags' gives more in-depth control on the record being inserted. See
     714                 :  * XLogSetRecordFlags() for details.
     715                 :  *
     716                 :  * 'topxid_included' tells whether the top-transaction id is logged along with
     717                 :  * the current subtransaction. See XLogRecordAssemble().
     718                 :  *
     719                 :  * The first XLogRecData in the chain must be for the record header, and its
     720                 :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
     721                 :  * xl_crc fields in the header, the rest of the header must already be filled
     722                 :  * by the caller.
     723                 :  *
     724                 :  * Returns XLOG pointer to end of record (beginning of next record).
     725                 :  * This can be used as LSN for data pages affected by the logged action.
     726                 :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
     727                 :  * before the data page can be written out.  This implements the basic
     728                 :  * WAL rule "write the log before the data".)
     729                 :  */
     730                 : XLogRecPtr
     731 GIC    19404394 : XLogInsertRecord(XLogRecData *rdata,
     732                 :                  XLogRecPtr fpw_lsn,
     733                 :                  uint8 flags,
     734 ECB             :                  int num_fpi,
     735                 :                  bool topxid_included)
     736                 : {
     737 GIC    19404394 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
     738                 :     pg_crc32c   rdata_crc;
     739                 :     bool        inserted;
     740 CBC    19404394 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
     741 GIC    19404394 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
     742        19404394 :     bool        isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
     743 ECB             :                                info == XLOG_SWITCH);
     744                 :     XLogRecPtr  StartPos;
     745                 :     XLogRecPtr  EndPos;
     746 GIC    19404394 :     bool        prevDoPageWrites = doPageWrites;
     747                 :     TimeLineID  insertTLI;
     748                 : 
     749 ECB             :     /* we assume that all of the record header is in the first chunk */
     750 GIC    19404394 :     Assert(rdata->len >= SizeOfXLogRecord);
     751                 : 
     752                 :     /* cross-check on whether we should be here or not */
     753 CBC    19404394 :     if (!XLogInsertAllowed())
     754 UIC           0 :         elog(ERROR, "cannot make new WAL entries during recovery");
     755                 : 
     756 ECB             :     /*
     757 EUB             :      * Given that we're not in recovery, InsertTimeLineID is set and can't
     758                 :      * change, so we can read it without a lock.
     759                 :      */
     760 GIC    19404394 :     insertTLI = XLogCtl->InsertTimeLineID;
     761                 : 
     762                 :     /*----------
     763 ECB             :      *
     764                 :      * We have now done all the preparatory work we can without holding a
     765                 :      * lock or modifying shared state. From here on, inserting the new WAL
     766                 :      * record to the shared WAL buffer cache is a two-step process:
     767                 :      *
     768                 :      * 1. Reserve the right amount of space from the WAL. The current head of
     769                 :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
     770                 :      *    insertpos_lck.
     771                 :      *
     772                 :      * 2. Copy the record to the reserved WAL space. This involves finding the
     773                 :      *    correct WAL buffer containing the reserved space, and copying the
     774                 :      *    record in place. This can be done concurrently in multiple processes.
     775                 :      *
     776                 :      * To keep track of which insertions are still in-progress, each concurrent
     777                 :      * inserter acquires an insertion lock. In addition to just indicating that
     778                 :      * an insertion is in progress, the lock tells others how far the inserter
     779                 :      * has progressed. There is a small fixed number of insertion locks,
     780                 :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
     781                 :      * boundary, it updates the value stored in the lock to how far it has
     782                 :      * inserted, to allow the previous buffer to be flushed.
     783                 :      *
     784                 :      * Holding onto an insertion lock also protects RedoRecPtr and
     785                 :      * fullPageWrites from changing until the insertion is finished.
     786                 :      *
     787                 :      * Step 2 can usually be done completely in parallel. If the required WAL
     788                 :      * page is not initialized yet, you have to grab WALBufMappingLock to
     789                 :      * initialize it, but the WAL writer tries to do that ahead of insertions
     790                 :      * to avoid that from happening in the critical path.
     791                 :      *
     792                 :      *----------
     793                 :      */
     794 GIC    19404394 :     START_CRIT_SECTION();
     795        19404394 :     if (isLogSwitch)
     796             442 :         WALInsertLockAcquireExclusive();
     797 ECB             :     else
     798 CBC    19403952 :         WALInsertLockAcquire();
     799 ECB             : 
     800                 :     /*
     801                 :      * Check to see if my copy of RedoRecPtr is out of date. If so, may have
     802                 :      * to go back and have the caller recompute everything. This can only
     803                 :      * happen just after a checkpoint, so it's better to be slow in this case
     804                 :      * and fast otherwise.
     805                 :      *
     806                 :      * Also check to see if fullPageWrites was just turned on or there's a
     807                 :      * running backup (which forces full-page writes); if we weren't already
     808                 :      * doing full-page writes then go back and recompute.
     809                 :      *
     810                 :      * If we aren't doing full-page writes then RedoRecPtr doesn't actually
     811                 :      * affect the contents of the XLOG record, so we'll update our local copy
     812                 :      * but not force a recomputation.  (If doPageWrites was just turned off,
     813                 :      * we could recompute the record without full pages, but we choose not to
     814                 :      * bother.)
     815                 :      */
     816 GIC    19404394 :     if (RedoRecPtr != Insert->RedoRecPtr)
     817                 :     {
     818            4522 :         Assert(RedoRecPtr < Insert->RedoRecPtr);
     819 CBC        4522 :         RedoRecPtr = Insert->RedoRecPtr;
     820                 :     }
     821 GNC    19404394 :     doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
     822 ECB             : 
     823 GIC    19404394 :     if (doPageWrites &&
     824 CBC    19214991 :         (!prevDoPageWrites ||
     825 GIC    18134561 :          (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
     826 ECB             :     {
     827                 :         /*
     828                 :          * Oops, some buffer now needs to be backed up that the caller didn't
     829                 :          * back up.  Start over.
     830                 :          */
     831 GIC        4992 :         WALInsertLockRelease();
     832            4992 :         END_CRIT_SECTION();
     833            4992 :         return InvalidXLogRecPtr;
     834 ECB             :     }
     835                 : 
     836                 :     /*
     837                 :      * Reserve space for the record in the WAL. This also sets the xl_prev
     838                 :      * pointer.
     839                 :      */
     840 GIC    19399402 :     if (isLogSwitch)
     841             300 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
     842                 :     else
     843 ECB             :     {
     844 CBC    19399102 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
     845                 :                                   &rechdr->xl_prev);
     846 GIC    19399102 :         inserted = true;
     847 ECB             :     }
     848                 : 
     849 CBC    19399402 :     if (inserted)
     850                 :     {
     851                 :         /*
     852 ECB             :          * Now that xl_prev has been filled in, calculate CRC of the record
     853                 :          * header.
     854                 :          */
     855 GIC    19399350 :         rdata_crc = rechdr->xl_crc;
     856        19399350 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
     857        19399350 :         FIN_CRC32C(rdata_crc);
     858 CBC    19399350 :         rechdr->xl_crc = rdata_crc;
     859 ECB             : 
     860                 :         /*
     861                 :          * All the record data, including the header, is now ready to be
     862                 :          * inserted. Copy the record in the space reserved.
     863                 :          */
     864 GIC    19399350 :         CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
     865                 :                             StartPos, EndPos, insertTLI);
     866                 : 
     867 ECB             :         /*
     868                 :          * Unless record is flagged as not important, update LSN of last
     869                 :          * important record in the current slot. When holding all locks, just
     870                 :          * update the first one.
     871                 :          */
     872 GIC    19399350 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
     873                 :         {
     874        19259839 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
     875 ECB             : 
     876 GIC    19259839 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
     877 ECB             :         }
     878                 :     }
     879                 :     else
     880                 :     {
     881                 :         /*
     882                 :          * This was an xlog-switch record, but the current insert location was
     883                 :          * already exactly at the beginning of a segment, so there was no need
     884                 :          * to do anything.
     885                 :          */
     886                 :     }
     887                 : 
     888                 :     /*
     889                 :      * Done! Let others know that we're finished.
     890                 :      */
     891 GIC    19399402 :     WALInsertLockRelease();
     892                 : 
     893        19399402 :     END_CRIT_SECTION();
     894 ECB             : 
     895 GIC    19399402 :     MarkCurrentTransactionIdLoggedIfAny();
     896 ECB             : 
     897                 :     /*
     898                 :      * Mark the top transaction id as logged (if needed) so that we do not try
     899                 :      * to log it again with the next WAL record in the current subtransaction.
     900                 :      */
     901 GIC    19399402 :     if (topxid_included)
     902             234 :         MarkSubxactTopXidLogged();
     903                 : 
     904 ECB             :     /*
     905                 :      * Update shared LogwrtRqst.Write, if we crossed page boundary.
     906                 :      */
     907 GIC    19399402 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
     908                 :     {
     909          444287 :         SpinLockAcquire(&XLogCtl->info_lck);
     910 ECB             :         /* advance global request to include new block(s) */
     911 GIC      444287 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
     912 CBC      444138 :             XLogCtl->LogwrtRqst.Write = EndPos;
     913                 :         /* update local result copy while I have the chance */
     914          444287 :         LogwrtResult = XLogCtl->LogwrtResult;
     915          444287 :         SpinLockRelease(&XLogCtl->info_lck);
     916                 :     }
     917 ECB             : 
     918                 :     /*
     919                 :      * If this was an XLOG_SWITCH record, flush the record and the empty
     920                 :      * padding space that fills the rest of the segment, and perform
     921                 :      * end-of-segment actions (eg, notifying archiver).
     922                 :      */
     923 GIC    19399402 :     if (isLogSwitch)
     924                 :     {
     925                 :         TRACE_POSTGRESQL_WAL_SWITCH();
     926 CBC         300 :         XLogFlush(EndPos);
     927                 : 
     928                 :         /*
     929 ECB             :          * Even though we reserved the rest of the segment for us, which is
     930                 :          * reflected in EndPos, we return a pointer to just the end of the
     931                 :          * xlog-switch record.
     932                 :          */
     933 GIC         300 :         if (inserted)
     934                 :         {
     935             248 :             EndPos = StartPos + SizeOfXLogRecord;
     936 CBC         248 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
     937                 :             {
     938 LBC           0 :                 uint64      offset = XLogSegmentOffset(EndPos, wal_segment_size);
     939 ECB             : 
     940 UIC           0 :                 if (offset == EndPos % XLOG_BLCKSZ)
     941 UBC           0 :                     EndPos += SizeOfXLogLongPHD;
     942                 :                 else
     943               0 :                     EndPos += SizeOfXLogShortPHD;
     944 EUB             :             }
     945                 :         }
     946                 :     }
     947                 : 
     948                 : #ifdef WAL_DEBUG
     949                 :     if (XLOG_DEBUG)
     950                 :     {
     951                 :         static XLogReaderState *debug_reader = NULL;
     952                 :         XLogRecord *record;
     953                 :         DecodedXLogRecord *decoded;
     954                 :         StringInfoData buf;
     955                 :         StringInfoData recordBuf;
     956                 :         char       *errormsg = NULL;
     957                 :         MemoryContext oldCxt;
     958                 : 
     959                 :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
     960                 : 
     961                 :         initStringInfo(&buf);
     962                 :         appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
     963                 : 
     964                 :         /*
     965                 :          * We have to piece together the WAL record data from the XLogRecData
     966                 :          * entries, so that we can pass it to the rm_desc function as one
     967                 :          * contiguous chunk.
     968                 :          */
     969                 :         initStringInfo(&recordBuf);
     970                 :         for (; rdata != NULL; rdata = rdata->next)
     971                 :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
     972                 : 
     973                 :         /* We also need temporary space to decode the record. */
     974                 :         record = (XLogRecord *) recordBuf.data;
     975                 :         decoded = (DecodedXLogRecord *)
     976                 :             palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
     977                 : 
     978                 :         if (!debug_reader)
     979                 :             debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
     980                 :                                               XL_ROUTINE(), NULL);
     981                 : 
     982                 :         if (!debug_reader)
     983                 :         {
     984                 :             appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
     985                 :         }
     986                 :         else if (!DecodeXLogRecord(debug_reader,
     987                 :                                    decoded,
     988                 :                                    record,
     989                 :                                    EndPos,
     990                 :                                    &errormsg))
     991                 :         {
     992                 :             appendStringInfo(&buf, "error decoding record: %s",
     993                 :                              errormsg ? errormsg : "no error message");
     994                 :         }
     995                 :         else
     996                 :         {
     997                 :             appendStringInfoString(&buf, " - ");
     998                 : 
     999                 :             debug_reader->record = decoded;
    1000                 :             xlog_outdesc(&buf, debug_reader);
    1001                 :             debug_reader->record = NULL;
    1002                 :         }
    1003                 :         elog(LOG, "%s", buf.data);
    1004                 : 
    1005                 :         pfree(decoded);
    1006                 :         pfree(buf.data);
    1007                 :         pfree(recordBuf.data);
    1008                 :         MemoryContextSwitchTo(oldCxt);
    1009                 :     }
    1010                 : #endif
    1011                 : 
    1012                 :     /*
    1013                 :      * Update our global variables
    1014                 :      */
    1015 GIC    19399402 :     ProcLastRecPtr = StartPos;
    1016        19399402 :     XactLastRecEnd = EndPos;
    1017                 : 
    1018 ECB             :     /* Report WAL traffic to the instrumentation. */
    1019 CBC    19399402 :     if (inserted)
    1020                 :     {
    1021 GIC    19399350 :         pgWalUsage.wal_bytes += rechdr->xl_tot_len;
    1022 CBC    19399350 :         pgWalUsage.wal_records++;
    1023 GIC    19399350 :         pgWalUsage.wal_fpi += num_fpi;
    1024 ECB             :     }
    1025                 : 
    1026 CBC    19399402 :     return EndPos;
    1027                 : }
    1028                 : 
    1029 ECB             : /*
    1030                 :  * Reserves the right amount of space for a record of given size from the WAL.
    1031                 :  * *StartPos is set to the beginning of the reserved section, *EndPos to
    1032                 :  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
    1033                 :  * used to set the xl_prev of this record.
    1034                 :  *
    1035                 :  * This is the performance critical part of XLogInsert that must be serialized
    1036                 :  * across backends. The rest can happen mostly in parallel. Try to keep this
    1037                 :  * section as short as possible, insertpos_lck can be heavily contended on a
    1038                 :  * busy system.
    1039                 :  *
    1040                 :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
    1041                 :  * where we actually copy the record to the reserved space.
    1042                 :  */
    1043                 : static void
    1044 GIC    19399102 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
    1045                 :                           XLogRecPtr *PrevPtr)
    1046                 : {
    1047 CBC    19399102 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1048                 :     uint64      startbytepos;
    1049                 :     uint64      endbytepos;
    1050 ECB             :     uint64      prevbytepos;
    1051                 : 
    1052 GIC    19399102 :     size = MAXALIGN(size);
    1053                 : 
    1054                 :     /* All (non xlog-switch) records should contain data. */
    1055 CBC    19399102 :     Assert(size > SizeOfXLogRecord);
    1056                 : 
    1057                 :     /*
    1058 ECB             :      * The duration the spinlock needs to be held is minimized by minimizing
    1059                 :      * the calculations that have to be done while holding the lock. The
    1060                 :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
    1061                 :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
    1062                 :      * page headers. The mapping between "usable" byte positions and physical
    1063                 :      * positions (XLogRecPtrs) can be done outside the locked region, and
    1064                 :      * because the usable byte position doesn't include any headers, reserving
    1065                 :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
    1066                 :      */
    1067 GIC    19399102 :     SpinLockAcquire(&Insert->insertpos_lck);
    1068                 : 
    1069        19399102 :     startbytepos = Insert->CurrBytePos;
    1070 CBC    19399102 :     endbytepos = startbytepos + size;
    1071 GIC    19399102 :     prevbytepos = Insert->PrevBytePos;
    1072 CBC    19399102 :     Insert->CurrBytePos = endbytepos;
    1073        19399102 :     Insert->PrevBytePos = startbytepos;
    1074 ECB             : 
    1075 CBC    19399102 :     SpinLockRelease(&Insert->insertpos_lck);
    1076 ECB             : 
    1077 GIC    19399102 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1078 CBC    19399102 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1079 GIC    19399102 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1080 ECB             : 
    1081                 :     /*
    1082                 :      * Check that the conversions between "usable byte positions" and
    1083                 :      * XLogRecPtrs work consistently in both directions.
    1084                 :      */
    1085 GIC    19399102 :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1086        19399102 :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1087        19399102 :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1088 CBC    19399102 : }
    1089 ECB             : 
    1090                 : /*
    1091                 :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
    1092                 :  *
    1093                 :  * A log-switch record is handled slightly differently. The rest of the
    1094                 :  * segment will be reserved for this insertion, as indicated by the returned
    1095                 :  * *EndPos value. However, if we are already at the beginning of the current
    1096                 :  * segment, *StartPos and *EndPos are set to the current location without
    1097                 :  * reserving any space, and the function returns false.
    1098                 : */
    1099                 : static bool
    1100 GIC         300 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
    1101                 : {
    1102             300 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1103 ECB             :     uint64      startbytepos;
    1104                 :     uint64      endbytepos;
    1105                 :     uint64      prevbytepos;
    1106 GIC         300 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
    1107                 :     XLogRecPtr  ptr;
    1108                 :     uint32      segleft;
    1109 ECB             : 
    1110                 :     /*
    1111                 :      * These calculations are a bit heavy-weight to be done while holding a
    1112                 :      * spinlock, but since we're holding all the WAL insertion locks, there
    1113                 :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
    1114                 :      * compete for it, but that's not called very frequently.
    1115                 :      */
    1116 GIC         300 :     SpinLockAcquire(&Insert->insertpos_lck);
    1117                 : 
    1118             300 :     startbytepos = Insert->CurrBytePos;
    1119 ECB             : 
    1120 GIC         300 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
    1121 CBC         300 :     if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
    1122                 :     {
    1123              52 :         SpinLockRelease(&Insert->insertpos_lck);
    1124              52 :         *EndPos = *StartPos = ptr;
    1125 GIC          52 :         return false;
    1126 ECB             :     }
    1127                 : 
    1128 CBC         248 :     endbytepos = startbytepos + size;
    1129 GIC         248 :     prevbytepos = Insert->PrevBytePos;
    1130                 : 
    1131 CBC         248 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
    1132             248 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    1133                 : 
    1134             248 :     segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
    1135             248 :     if (segleft != wal_segment_size)
    1136                 :     {
    1137 ECB             :         /* consume the rest of the segment */
    1138 CBC         248 :         *EndPos += segleft;
    1139 GIC         248 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
    1140                 :     }
    1141 CBC         248 :     Insert->CurrBytePos = endbytepos;
    1142             248 :     Insert->PrevBytePos = startbytepos;
    1143                 : 
    1144             248 :     SpinLockRelease(&Insert->insertpos_lck);
    1145 ECB             : 
    1146 GIC         248 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
    1147 ECB             : 
    1148 GIC         248 :     Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
    1149 CBC         248 :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    1150 GIC         248 :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    1151 CBC         248 :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
    1152 ECB             : 
    1153 CBC         248 :     return true;
    1154 ECB             : }
    1155                 : 
    1156                 : /*
    1157                 :  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
    1158                 :  * area in the WAL.
    1159                 :  */
    1160                 : static void
    1161 GIC    19399350 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
    1162                 :                     XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
    1163                 : {
    1164 ECB             :     char       *currpos;
    1165                 :     int         freespace;
    1166                 :     int         written;
    1167                 :     XLogRecPtr  CurrPos;
    1168                 :     XLogPageHeader pagehdr;
    1169                 : 
    1170                 :     /*
    1171                 :      * Get a pointer to the right place in the right WAL buffer to start
    1172                 :      * inserting to.
    1173                 :      */
    1174 GIC    19399350 :     CurrPos = StartPos;
    1175        19399350 :     currpos = GetXLogBuffer(CurrPos, tli);
    1176        19399350 :     freespace = INSERT_FREESPACE(CurrPos);
    1177 ECB             : 
    1178                 :     /*
    1179                 :      * there should be enough space for at least the first field (xl_tot_len)
    1180                 :      * on this page.
    1181                 :      */
    1182 GIC    19399350 :     Assert(freespace >= sizeof(uint32));
    1183                 : 
    1184                 :     /* Copy record data */
    1185 CBC    19399350 :     written = 0;
    1186 GIC    85337519 :     while (rdata != NULL)
    1187                 :     {
    1188 CBC    65938169 :         char       *rdata_data = rdata->data;
    1189        65938169 :         int         rdata_len = rdata->len;
    1190                 : 
    1191        66429163 :         while (rdata_len > freespace)
    1192 ECB             :         {
    1193                 :             /*
    1194                 :              * Write what fits on this page, and continue on the next page.
    1195                 :              */
    1196 GIC      490994 :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
    1197          490994 :             memcpy(currpos, rdata_data, freespace);
    1198          490994 :             rdata_data += freespace;
    1199 CBC      490994 :             rdata_len -= freespace;
    1200          490994 :             written += freespace;
    1201          490994 :             CurrPos += freespace;
    1202 ECB             : 
    1203                 :             /*
    1204                 :              * Get pointer to beginning of next page, and set the xlp_rem_len
    1205                 :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
    1206                 :              *
    1207                 :              * It's safe to set the contrecord flag and xlp_rem_len without a
    1208                 :              * lock on the page. All the other flags were already set when the
    1209                 :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
    1210                 :              * only backend that needs to set the contrecord flag.
    1211                 :              */
    1212 GIC      490994 :             currpos = GetXLogBuffer(CurrPos, tli);
    1213          490994 :             pagehdr = (XLogPageHeader) currpos;
    1214          490994 :             pagehdr->xlp_rem_len = write_len - written;
    1215 CBC      490994 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
    1216 ECB             : 
    1217                 :             /* skip over the page header */
    1218 CBC      490994 :             if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
    1219                 :             {
    1220 GIC         426 :                 CurrPos += SizeOfXLogLongPHD;
    1221 CBC         426 :                 currpos += SizeOfXLogLongPHD;
    1222                 :             }
    1223 ECB             :             else
    1224                 :             {
    1225 GIC      490568 :                 CurrPos += SizeOfXLogShortPHD;
    1226          490568 :                 currpos += SizeOfXLogShortPHD;
    1227                 :             }
    1228 CBC      490994 :             freespace = INSERT_FREESPACE(CurrPos);
    1229 ECB             :         }
    1230                 : 
    1231 CBC    65938169 :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    1232 GIC    65938169 :         memcpy(currpos, rdata_data, rdata_len);
    1233        65938169 :         currpos += rdata_len;
    1234 CBC    65938169 :         CurrPos += rdata_len;
    1235        65938169 :         freespace -= rdata_len;
    1236        65938169 :         written += rdata_len;
    1237 ECB             : 
    1238 CBC    65938169 :         rdata = rdata->next;
    1239 ECB             :     }
    1240 GIC    19399350 :     Assert(written == write_len);
    1241 ECB             : 
    1242                 :     /*
    1243                 :      * If this was an xlog-switch, it's not enough to write the switch record,
    1244                 :      * we also have to consume all the remaining space in the WAL segment.  We
    1245                 :      * have already reserved that space, but we need to actually fill it.
    1246                 :      */
    1247 GIC    19399350 :     if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
    1248                 :     {
    1249                 :         /* An xlog-switch record doesn't contain any data besides the header */
    1250 CBC         248 :         Assert(write_len == SizeOfXLogRecord);
    1251                 : 
    1252                 :         /* Assert that we did reserve the right amount of space */
    1253             248 :         Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
    1254                 : 
    1255                 :         /* Use up all the remaining space on the current page */
    1256             248 :         CurrPos += freespace;
    1257                 : 
    1258                 :         /*
    1259 ECB             :          * Cause all remaining pages in the segment to be flushed, leaving the
    1260                 :          * XLog position where it should be, at the start of the next segment.
    1261                 :          * We do this one page at a time, to make sure we don't deadlock
    1262                 :          * against ourselves if wal_buffers < wal_segment_size.
    1263                 :          */
    1264 GIC      384232 :         while (CurrPos < EndPos)
    1265                 :         {
    1266                 :             /*
    1267 ECB             :              * The minimal action to flush the page would be to call
    1268                 :              * WALInsertLockUpdateInsertingAt(CurrPos) followed by
    1269                 :              * AdvanceXLInsertBuffer(...).  The page would be left initialized
    1270                 :              * mostly to zeros, except for the page header (always the short
    1271                 :              * variant, as this is never a segment's first page).
    1272                 :              *
    1273                 :              * The large vistas of zeros are good for compressibility, but the
    1274                 :              * headers interrupting them every XLOG_BLCKSZ (with values that
    1275                 :              * differ from page to page) are not.  The effect varies with
    1276                 :              * compression tool, but bzip2 for instance compresses about an
    1277                 :              * order of magnitude worse if those headers are left in place.
    1278                 :              *
    1279                 :              * Rather than complicating AdvanceXLInsertBuffer itself (which is
    1280                 :              * called in heavily-loaded circumstances as well as this lightly-
    1281                 :              * loaded one) with variant behavior, we just use GetXLogBuffer
    1282                 :              * (which itself calls the two methods we need) to get the pointer
    1283                 :              * and zero most of the page.  Then we just zero the page header.
    1284                 :              */
    1285 GIC      383984 :             currpos = GetXLogBuffer(CurrPos, tli);
    1286         1535936 :             MemSet(currpos, 0, SizeOfXLogShortPHD);
    1287                 : 
    1288 CBC      383984 :             CurrPos += XLOG_BLCKSZ;
    1289 ECB             :         }
    1290                 :     }
    1291                 :     else
    1292                 :     {
    1293                 :         /* Align the end position, so that the next record starts aligned */
    1294 GIC    19399102 :         CurrPos = MAXALIGN64(CurrPos);
    1295                 :     }
    1296                 : 
    1297 CBC    19399350 :     if (CurrPos != EndPos)
    1298 UIC           0 :         elog(PANIC, "space reserved for WAL record does not match what was written");
    1299 GIC    19399350 : }
    1300 ECB             : 
    1301 EUB             : /*
    1302 ECB             :  * Acquire a WAL insertion lock, for inserting to WAL.
    1303                 :  */
    1304                 : static void
    1305 GIC    19403953 : WALInsertLockAcquire(void)
    1306                 : {
    1307                 :     bool        immed;
    1308 ECB             : 
    1309                 :     /*
    1310                 :      * It doesn't matter which of the WAL insertion locks we acquire, so try
    1311                 :      * the one we used last time.  If the system isn't particularly busy, it's
    1312                 :      * a good bet that it's still available, and it's good to have some
    1313                 :      * affinity to a particular lock so that you don't unnecessarily bounce
    1314                 :      * cache lines between processes when there's no contention.
    1315                 :      *
    1316                 :      * If this is the first time through in this backend, pick a lock
    1317                 :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
    1318                 :      * lot of very short connections.
    1319                 :      */
    1320                 :     static int  lockToTry = -1;
    1321                 : 
    1322 GIC    19403953 :     if (lockToTry == -1)
    1323            5924 :         lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
    1324        19403953 :     MyLockNo = lockToTry;
    1325 ECB             : 
    1326                 :     /*
    1327                 :      * The insertingAt value is initially set to 0, as we don't know our
    1328                 :      * insert location yet.
    1329                 :      */
    1330 GIC    19403953 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
    1331        19403953 :     if (!immed)
    1332                 :     {
    1333 ECB             :         /*
    1334                 :          * If we couldn't get the lock immediately, try another lock next
    1335                 :          * time.  On a system with more insertion locks than concurrent
    1336                 :          * inserters, this causes all the inserters to eventually migrate to a
    1337                 :          * lock that no-one else is using.  On a system with more inserters
    1338                 :          * than locks, it still helps to distribute the inserters evenly
    1339                 :          * across the locks.
    1340                 :          */
    1341 GIC         635 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
    1342                 :     }
    1343        19403953 : }
    1344 ECB             : 
    1345                 : /*
    1346                 :  * Acquire all WAL insertion locks, to prevent other backends from inserting
    1347                 :  * to WAL.
    1348                 :  */
    1349                 : static void
    1350 GIC        3502 : WALInsertLockAcquireExclusive(void)
    1351                 : {
    1352                 :     int         i;
    1353 ECB             : 
    1354                 :     /*
    1355                 :      * When holding all the locks, all but the last lock's insertingAt
    1356                 :      * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
    1357                 :      * XLogRecPtr value, to make sure that no-one blocks waiting on those.
    1358                 :      */
    1359 GIC       28016 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
    1360                 :     {
    1361           24514 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1362 CBC       24514 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
    1363 GIC       24514 :                         &WALInsertLocks[i].l.insertingAt,
    1364 ECB             :                         PG_UINT64_MAX);
    1365                 :     }
    1366                 :     /* Variable value reset to 0 at release */
    1367 GIC        3502 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    1368                 : 
    1369            3502 :     holdingAllLocks = true;
    1370 CBC        3502 : }
    1371                 : 
    1372 ECB             : /*
    1373                 :  * Release our insertion lock (or locks, if we're holding them all).
    1374                 :  *
    1375                 :  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
    1376                 :  * next time the lock is acquired.
    1377                 :  */
    1378                 : static void
    1379 GIC    19407455 : WALInsertLockRelease(void)
    1380                 : {
    1381        19407455 :     if (holdingAllLocks)
    1382 ECB             :     {
    1383                 :         int         i;
    1384                 : 
    1385 GIC       31518 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1386           28016 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
    1387           28016 :                                   &WALInsertLocks[i].l.insertingAt,
    1388 ECB             :                                   0);
    1389                 : 
    1390 CBC        3502 :         holdingAllLocks = false;
    1391                 :     }
    1392                 :     else
    1393 ECB             :     {
    1394 GIC    19403953 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
    1395        19403953 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
    1396                 :                               0);
    1397 ECB             :     }
    1398 CBC    19407455 : }
    1399                 : 
    1400                 : /*
    1401 ECB             :  * Update our insertingAt value, to let others know that we've finished
    1402                 :  * inserting up to that point.
    1403                 :  */
    1404                 : static void
    1405 GIC      742721 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
    1406                 : {
    1407          742721 :     if (holdingAllLocks)
    1408 ECB             :     {
    1409                 :         /*
    1410                 :          * We use the last lock to mark our actual position, see comments in
    1411                 :          * WALInsertLockAcquireExclusive.
    1412                 :          */
    1413 GIC      380439 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
    1414          380439 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
    1415                 :                         insertingAt);
    1416 ECB             :     }
    1417                 :     else
    1418 GIC      362282 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
    1419          362282 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
    1420                 :                         insertingAt);
    1421 CBC      742721 : }
    1422 ECB             : 
    1423                 : /*
    1424                 :  * Wait for any WAL insertions < upto to finish.
    1425                 :  *
    1426                 :  * Returns the location of the oldest insertion that is still in-progress.
    1427                 :  * Any WAL prior to that point has been fully copied into WAL buffers, and
    1428                 :  * can be flushed out to disk. Because this waits for any insertions older
    1429                 :  * than 'upto' to finish, the return value is always >= 'upto'.
    1430                 :  *
    1431                 :  * Note: When you are about to write out WAL, you must call this function
    1432                 :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
    1433                 :  * need to wait for an insertion to finish (or at least advance to next
    1434                 :  * uninitialized page), and the inserter might need to evict an old WAL buffer
    1435                 :  * to make room for a new one, which in turn requires WALWriteLock.
    1436                 :  */
    1437                 : static XLogRecPtr
    1438 GIC      737182 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
    1439                 : {
    1440                 :     uint64      bytepos;
    1441 ECB             :     XLogRecPtr  reservedUpto;
    1442                 :     XLogRecPtr  finishedUpto;
    1443 GIC      737182 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1444                 :     int         i;
    1445                 : 
    1446 CBC      737182 :     if (MyProc == NULL)
    1447 UIC           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    1448                 : 
    1449 ECB             :     /* Read the current insert position */
    1450 GBC      737182 :     SpinLockAcquire(&Insert->insertpos_lck);
    1451 GIC      737182 :     bytepos = Insert->CurrBytePos;
    1452          737182 :     SpinLockRelease(&Insert->insertpos_lck);
    1453 CBC      737182 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
    1454 ECB             : 
    1455                 :     /*
    1456                 :      * No-one should request to flush a piece of WAL that hasn't even been
    1457                 :      * reserved yet. However, it can happen if there is a block with a bogus
    1458                 :      * LSN on disk, for example. XLogFlush checks for that situation and
    1459                 :      * complains, but only after the flush. Here we just assume that to mean
    1460                 :      * that all WAL that has been reserved needs to be finished. In this
    1461                 :      * corner-case, the return value can be smaller than 'upto' argument.
    1462                 :      */
    1463 GIC      737182 :     if (upto > reservedUpto)
    1464                 :     {
    1465 UIC           0 :         ereport(LOG,
    1466 ECB             :                 (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
    1467                 :                         LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
    1468 UBC           0 :         upto = reservedUpto;
    1469                 :     }
    1470                 : 
    1471 EUB             :     /*
    1472                 :      * Loop through all the locks, sleeping on any in-progress insert older
    1473                 :      * than 'upto'.
    1474                 :      *
    1475                 :      * finishedUpto is our return value, indicating the point upto which all
    1476                 :      * the WAL insertions have been finished. Initialize it to the head of
    1477                 :      * reserved WAL, and as we iterate through the insertion locks, back it
    1478                 :      * out for any insertion that's still in progress.
    1479                 :      */
    1480 GIC      737182 :     finishedUpto = reservedUpto;
    1481         6634638 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    1482                 :     {
    1483 CBC     5897456 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
    1484 ECB             : 
    1485                 :         do
    1486                 :         {
    1487                 :             /*
    1488                 :              * See if this insertion is in progress.  LWLockWaitForVar will
    1489                 :              * wait for the lock to be released, or for the 'value' to be set
    1490                 :              * by a LWLockUpdateVar call.  When a lock is initially acquired,
    1491                 :              * its value is 0 (InvalidXLogRecPtr), which means that we don't
    1492                 :              * know where it's inserting yet.  We will have to wait for it. If
    1493                 :              * it's a small insertion, the record will most likely fit on the
    1494                 :              * same page and the inserter will release the lock without ever
    1495                 :              * calling LWLockUpdateVar.  But if it has to sleep, it will
    1496                 :              * advertise the insertion point with LWLockUpdateVar before
    1497                 :              * sleeping.
    1498                 :              */
    1499 GIC     5899012 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
    1500         5899012 :                                  &WALInsertLocks[i].l.insertingAt,
    1501                 :                                  insertingat, &insertingat))
    1502 ECB             :             {
    1503                 :                 /* the lock was free, so no insertion in progress */
    1504 GIC     2958401 :                 insertingat = InvalidXLogRecPtr;
    1505         2958401 :                 break;
    1506                 :             }
    1507 ECB             : 
    1508                 :             /*
    1509                 :              * This insertion is still in progress. Have to wait, unless the
    1510                 :              * inserter has proceeded past 'upto'.
    1511                 :              */
    1512 GIC     2940611 :         } while (insertingat < upto);
    1513                 : 
    1514         5897456 :         if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
    1515 CBC      435915 :             finishedUpto = insertingat;
    1516                 :     }
    1517          737182 :     return finishedUpto;
    1518 ECB             : }
    1519                 : 
    1520                 : /*
    1521                 :  * Get a pointer to the right location in the WAL buffer containing the
    1522                 :  * given XLogRecPtr.
    1523                 :  *
    1524                 :  * If the page is not initialized yet, it is initialized. That might require
    1525                 :  * evicting an old dirty buffer from the buffer cache, which means I/O.
    1526                 :  *
    1527                 :  * The caller must ensure that the page containing the requested location
    1528                 :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
    1529                 :  * hold onto a WAL insertion lock with the insertingAt position set to
    1530                 :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
    1531                 :  * to evict an old page from the buffer. (This means that once you call
    1532                 :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
    1533                 :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
    1534                 :  * later, because older buffers might be recycled already)
    1535                 :  */
    1536                 : static char *
    1537 GIC    20274329 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
    1538                 : {
    1539                 :     int         idx;
    1540 ECB             :     XLogRecPtr  endptr;
    1541                 :     static uint64 cachedPage = 0;
    1542                 :     static char *cachedPos = NULL;
    1543                 :     XLogRecPtr  expectedEndPtr;
    1544                 : 
    1545                 :     /*
    1546                 :      * Fast path for the common case that we need to access again the same
    1547                 :      * page as last time.
    1548                 :      */
    1549 GIC    20274329 :     if (ptr / XLOG_BLCKSZ == cachedPage)
    1550                 :     {
    1551        19285796 :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1552 CBC    19285796 :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1553 GIC    19285796 :         return cachedPos + ptr % XLOG_BLCKSZ;
    1554 ECB             :     }
    1555                 : 
    1556                 :     /*
    1557                 :      * The XLog buffer cache is organized so that a page is always loaded to a
    1558                 :      * particular buffer.  That way we can easily calculate the buffer a given
    1559                 :      * page must be loaded into, from the XLogRecPtr alone.
    1560                 :      */
    1561 GIC      988533 :     idx = XLogRecPtrToBufIdx(ptr);
    1562                 : 
    1563                 :     /*
    1564 ECB             :      * See what page is loaded in the buffer at the moment. It could be the
    1565                 :      * page we're looking for, or something older. It can't be anything newer
    1566                 :      * - that would imply the page we're looking for has already been written
    1567                 :      * out to disk and evicted, and the caller is responsible for making sure
    1568                 :      * that doesn't happen.
    1569                 :      *
    1570                 :      * However, we don't hold a lock while we read the value. If someone has
    1571                 :      * just initialized the page, it's possible that we get a "torn read" of
    1572                 :      * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
    1573                 :      * that case we will see a bogus value. That's ok, we'll grab the mapping
    1574                 :      * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
    1575                 :      * the page we're looking for. But it means that when we do this unlocked
    1576                 :      * read, we might see a value that appears to be ahead of the page we're
    1577                 :      * looking for. Don't PANIC on that, until we've verified the value while
    1578                 :      * holding the lock.
    1579                 :      */
    1580 GIC      988533 :     expectedEndPtr = ptr;
    1581          988533 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
    1582                 : 
    1583 CBC      988533 :     endptr = XLogCtl->xlblocks[idx];
    1584          988533 :     if (expectedEndPtr != endptr)
    1585                 :     {
    1586 ECB             :         XLogRecPtr  initializedUpto;
    1587                 : 
    1588                 :         /*
    1589                 :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
    1590                 :          * know how far we're finished with inserting the record.
    1591                 :          *
    1592                 :          * NB: If 'ptr' points to just after the page header, advertise a
    1593                 :          * position at the beginning of the page rather than 'ptr' itself. If
    1594                 :          * there are no other insertions running, someone might try to flush
    1595                 :          * up to our advertised location. If we advertised a position after
    1596                 :          * the page header, someone might try to flush the page header, even
    1597                 :          * though page might actually not be initialized yet. As the first
    1598                 :          * inserter on the page, we are effectively responsible for making
    1599                 :          * sure that it's initialized, before we let insertingAt to move past
    1600                 :          * the page header.
    1601                 :          */
    1602 GIC      742721 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
    1603           12172 :             XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
    1604           12172 :             initializedUpto = ptr - SizeOfXLogShortPHD;
    1605 CBC      730549 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
    1606             343 :                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
    1607             173 :             initializedUpto = ptr - SizeOfXLogLongPHD;
    1608 ECB             :         else
    1609 CBC      730376 :             initializedUpto = ptr;
    1610 ECB             : 
    1611 GIC      742721 :         WALInsertLockUpdateInsertingAt(initializedUpto);
    1612 ECB             : 
    1613 GIC      742721 :         AdvanceXLInsertBuffer(ptr, tli, false);
    1614 CBC      742721 :         endptr = XLogCtl->xlblocks[idx];
    1615                 : 
    1616          742721 :         if (expectedEndPtr != endptr)
    1617 LBC           0 :             elog(PANIC, "could not find WAL buffer for %X/%X",
    1618                 :                  LSN_FORMAT_ARGS(ptr));
    1619 ECB             :     }
    1620 EUB             :     else
    1621                 :     {
    1622                 :         /*
    1623                 :          * Make sure the initialization of the page is visible to us, and
    1624                 :          * won't arrive later to overwrite the WAL data we write on the page.
    1625                 :          */
    1626 GIC      245812 :         pg_memory_barrier();
    1627                 :     }
    1628                 : 
    1629 ECB             :     /*
    1630                 :      * Found the buffer holding this page. Return a pointer to the right
    1631                 :      * offset within the page.
    1632                 :      */
    1633 GIC      988533 :     cachedPage = ptr / XLOG_BLCKSZ;
    1634          988533 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
    1635                 : 
    1636 CBC      988533 :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
    1637          988533 :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
    1638                 : 
    1639          988533 :     return cachedPos + ptr % XLOG_BLCKSZ;
    1640 ECB             : }
    1641                 : 
    1642                 : /*
    1643                 :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
    1644                 :  * is the position starting from the beginning of WAL, excluding all WAL
    1645                 :  * page headers.
    1646                 :  */
    1647                 : static XLogRecPtr
    1648 GIC    38806008 : XLogBytePosToRecPtr(uint64 bytepos)
    1649                 : {
    1650                 :     uint64      fullsegs;
    1651 ECB             :     uint64      fullpages;
    1652                 :     uint64      bytesleft;
    1653                 :     uint32      seg_offset;
    1654                 :     XLogRecPtr  result;
    1655                 : 
    1656 GIC    38806008 :     fullsegs = bytepos / UsableBytesInSegment;
    1657        38806008 :     bytesleft = bytepos % UsableBytesInSegment;
    1658                 : 
    1659 CBC    38806008 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1660 ECB             :     {
    1661                 :         /* fits on first page of segment */
    1662 CBC       73916 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
    1663                 :     }
    1664                 :     else
    1665 ECB             :     {
    1666                 :         /* account for the first page on segment with long header */
    1667 GIC    38732092 :         seg_offset = XLOG_BLCKSZ;
    1668        38732092 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1669                 : 
    1670 CBC    38732092 :         fullpages = bytesleft / UsableBytesInPage;
    1671        38732092 :         bytesleft = bytesleft % UsableBytesInPage;
    1672                 : 
    1673        38732092 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1674 ECB             :     }
    1675                 : 
    1676 CBC    38806008 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1677                 : 
    1678 GIC    38806008 :     return result;
    1679 ECB             : }
    1680                 : 
    1681                 : /*
    1682                 :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
    1683                 :  * returns a pointer to the beginning of the page (ie. before page header),
    1684                 :  * not to where the first xlog record on that page would go to. This is used
    1685                 :  * when converting a pointer to the end of a record.
    1686                 :  */
    1687                 : static XLogRecPtr
    1688 GIC    20136832 : XLogBytePosToEndRecPtr(uint64 bytepos)
    1689                 : {
    1690                 :     uint64      fullsegs;
    1691 ECB             :     uint64      fullpages;
    1692                 :     uint64      bytesleft;
    1693                 :     uint32      seg_offset;
    1694                 :     XLogRecPtr  result;
    1695                 : 
    1696 GIC    20136832 :     fullsegs = bytepos / UsableBytesInSegment;
    1697        20136832 :     bytesleft = bytepos % UsableBytesInSegment;
    1698                 : 
    1699 CBC    20136832 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    1700 ECB             :     {
    1701                 :         /* fits on first page of segment */
    1702 CBC      394401 :         if (bytesleft == 0)
    1703 GIC      357373 :             seg_offset = 0;
    1704                 :         else
    1705 CBC       37028 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
    1706 ECB             :     }
    1707                 :     else
    1708                 :     {
    1709                 :         /* account for the first page on segment with long header */
    1710 GIC    19742431 :         seg_offset = XLOG_BLCKSZ;
    1711        19742431 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
    1712                 : 
    1713 CBC    19742431 :         fullpages = bytesleft / UsableBytesInPage;
    1714        19742431 :         bytesleft = bytesleft % UsableBytesInPage;
    1715                 : 
    1716        19742431 :         if (bytesleft == 0)
    1717           20224 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
    1718                 :         else
    1719        19722207 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    1720 ECB             :     }
    1721                 : 
    1722 CBC    20136832 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
    1723                 : 
    1724 GIC    20136832 :     return result;
    1725 ECB             : }
    1726                 : 
    1727                 : /*
    1728                 :  * Convert an XLogRecPtr to a "usable byte position".
    1729                 :  */
    1730                 : static uint64
    1731 GIC    58200582 : XLogRecPtrToBytePos(XLogRecPtr ptr)
    1732                 : {
    1733                 :     uint64      fullsegs;
    1734 ECB             :     uint32      fullpages;
    1735                 :     uint32      offset;
    1736                 :     uint64      result;
    1737                 : 
    1738 GIC    58200582 :     XLByteToSeg(ptr, fullsegs, wal_segment_size);
    1739                 : 
    1740        58200582 :     fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
    1741 CBC    58200582 :     offset = ptr % XLOG_BLCKSZ;
    1742                 : 
    1743        58200582 :     if (fullpages == 0)
    1744 ECB             :     {
    1745 GIC      111171 :         result = fullsegs * UsableBytesInSegment;
    1746 CBC      111171 :         if (offset > 0)
    1747                 :         {
    1748          110641 :             Assert(offset >= SizeOfXLogLongPHD);
    1749          110641 :             result += offset - SizeOfXLogLongPHD;
    1750                 :         }
    1751 ECB             :     }
    1752                 :     else
    1753                 :     {
    1754 GIC    58089411 :         result = fullsegs * UsableBytesInSegment +
    1755        58089411 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
    1756        58089411 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
    1757 CBC    58089411 :         if (offset > 0)
    1758 ECB             :         {
    1759 CBC    58069581 :             Assert(offset >= SizeOfXLogShortPHD);
    1760        58069581 :             result += offset - SizeOfXLogShortPHD;
    1761                 :         }
    1762 ECB             :     }
    1763                 : 
    1764 GIC    58200582 :     return result;
    1765                 : }
    1766                 : 
    1767 ECB             : /*
    1768                 :  * Initialize XLOG buffers, writing out old buffers if they still contain
    1769                 :  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
    1770                 :  * true, initialize as many pages as we can without having to write out
    1771                 :  * unwritten data. Any new pages are initialized to zeros, with pages headers
    1772                 :  * initialized properly.
    1773                 :  */
    1774                 : static void
    1775 GIC      749930 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
    1776                 : {
    1777          749930 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    1778 ECB             :     int         nextidx;
    1779                 :     XLogRecPtr  OldPageRqstPtr;
    1780                 :     XLogwrtRqst WriteRqst;
    1781 GIC      749930 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
    1782                 :     XLogRecPtr  NewPageBeginPtr;
    1783                 :     XLogPageHeader NewPage;
    1784 CBC      749930 :     int         npages pg_attribute_unused() = 0;
    1785                 : 
    1786 GIC      749930 :     LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    1787 ECB             : 
    1788                 :     /*
    1789                 :      * Now that we have the lock, check if someone initialized the page
    1790                 :      * already.
    1791                 :      */
    1792 GIC     2102859 :     while (upto >= XLogCtl->InitializedUpTo || opportunistic)
    1793                 :     {
    1794         1360138 :         nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
    1795 ECB             : 
    1796                 :         /*
    1797                 :          * Get ending-offset of the buffer page we need to replace (this may
    1798                 :          * be zero if the buffer hasn't been used yet).  Fall through if it's
    1799                 :          * already written out.
    1800                 :          */
    1801 GIC     1360138 :         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
    1802         1360138 :         if (LogwrtResult.Write < OldPageRqstPtr)
    1803                 :         {
    1804 ECB             :             /*
    1805                 :              * Nope, got work to do. If we just want to pre-initialize as much
    1806                 :              * as we can without flushing, give up now.
    1807                 :              */
    1808 GIC      442792 :             if (opportunistic)
    1809            7209 :                 break;
    1810                 : 
    1811 ECB             :             /* Before waiting, get info_lck and update LogwrtResult */
    1812 CBC      435583 :             SpinLockAcquire(&XLogCtl->info_lck);
    1813 GIC      435583 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
    1814          357826 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
    1815 CBC      435583 :             LogwrtResult = XLogCtl->LogwrtResult;
    1816          435583 :             SpinLockRelease(&XLogCtl->info_lck);
    1817 ECB             : 
    1818                 :             /*
    1819                 :              * Now that we have an up-to-date LogwrtResult value, see if we
    1820                 :              * still need to write it or if someone else already did.
    1821                 :              */
    1822 GIC      435583 :             if (LogwrtResult.Write < OldPageRqstPtr)
    1823                 :             {
    1824                 :                 /*
    1825 ECB             :                  * Must acquire write lock. Release WALBufMappingLock first,
    1826                 :                  * to make sure that all insertions that we need to wait for
    1827                 :                  * can finish (up to this same position). Otherwise we risk
    1828                 :                  * deadlock.
    1829                 :                  */
    1830 GIC      433903 :                 LWLockRelease(WALBufMappingLock);
    1831                 : 
    1832          433903 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
    1833 ECB             : 
    1834 GIC      433903 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    1835 ECB             : 
    1836 GIC      433903 :                 LogwrtResult = XLogCtl->LogwrtResult;
    1837 CBC      433903 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
    1838                 :                 {
    1839 ECB             :                     /* OK, someone wrote it already */
    1840 CBC        1477 :                     LWLockRelease(WALWriteLock);
    1841                 :                 }
    1842                 :                 else
    1843 ECB             :                 {
    1844                 :                     /* Have to write it ourselves */
    1845                 :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
    1846 GIC      432426 :                     WriteRqst.Write = OldPageRqstPtr;
    1847          432426 :                     WriteRqst.Flush = 0;
    1848          432426 :                     XLogWrite(WriteRqst, tli, false);
    1849 CBC      432426 :                     LWLockRelease(WALWriteLock);
    1850          432426 :                     PendingWalStats.wal_buffers_full++;
    1851 ECB             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
    1852                 :                 }
    1853                 :                 /* Re-acquire WALBufMappingLock and retry */
    1854 GIC      433903 :                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    1855          433903 :                 continue;
    1856                 :             }
    1857 ECB             :         }
    1858                 : 
    1859                 :         /*
    1860                 :          * Now the next buffer slot is free and we can set it up to be the
    1861                 :          * next output page.
    1862                 :          */
    1863 GIC      919026 :         NewPageBeginPtr = XLogCtl->InitializedUpTo;
    1864          919026 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
    1865                 : 
    1866 CBC      919026 :         Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
    1867 ECB             : 
    1868 GIC      919026 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
    1869 ECB             : 
    1870                 :         /*
    1871                 :          * Be sure to re-zero the buffer so that bytes beyond what we've
    1872                 :          * written will look like zeroes and not valid XLOG records...
    1873                 :          */
    1874 GIC      919026 :         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
    1875                 : 
    1876                 :         /*
    1877 ECB             :          * Fill the new page's header
    1878                 :          */
    1879 GIC      919026 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
    1880                 : 
    1881                 :         /* NewPage->xlp_info = 0; */ /* done by memset */
    1882 CBC      919026 :         NewPage->xlp_tli = tli;
    1883 GIC      919026 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
    1884                 : 
    1885 ECB             :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
    1886                 : 
    1887                 :         /*
    1888                 :          * If online backup is not in progress, mark the header to indicate
    1889                 :          * that WAL records beginning in this page have removable backup
    1890                 :          * blocks.  This allows the WAL archiver to know whether it is safe to
    1891                 :          * compress archived WAL data by transforming full-block records into
    1892                 :          * the non-full-block format.  It is sufficient to record this at the
    1893                 :          * page level because we force a page switch (in fact a segment
    1894                 :          * switch) when starting a backup, so the flag will be off before any
    1895                 :          * records can be written during the backup.  At the end of a backup,
    1896                 :          * the last page will be marked as all unsafe when perhaps only part
    1897                 :          * is unsafe, but at worst the archiver would miss the opportunity to
    1898                 :          * compress a few records.
    1899                 :          */
    1900 GNC      919026 :         if (Insert->runningBackups == 0)
    1901 GIC      810359 :             NewPage->xlp_info |= XLP_BKP_REMOVABLE;
    1902                 : 
    1903 ECB             :         /*
    1904                 :          * If first page of an XLOG segment file, make it a long header.
    1905                 :          */
    1906 GIC      919026 :         if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
    1907                 :         {
    1908             613 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
    1909 ECB             : 
    1910 GIC         613 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
    1911 CBC         613 :             NewLongPage->xlp_seg_size = wal_segment_size;
    1912 GIC         613 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    1913 CBC         613 :             NewPage->xlp_info |= XLP_LONG_HEADER;
    1914 ECB             :         }
    1915                 : 
    1916                 :         /*
    1917                 :          * Make sure the initialization of the page becomes visible to others
    1918                 :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
    1919                 :          * holding a lock.
    1920                 :          */
    1921 GIC      919026 :         pg_write_barrier();
    1922                 : 
    1923          919026 :         *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
    1924 ECB             : 
    1925 GIC      919026 :         XLogCtl->InitializedUpTo = NewPageEndPtr;
    1926 ECB             : 
    1927 GIC      919026 :         npages++;
    1928 ECB             :     }
    1929 GIC      749930 :     LWLockRelease(WALBufMappingLock);
    1930 ECB             : 
    1931                 : #ifdef WAL_DEBUG
    1932                 :     if (XLOG_DEBUG && npages > 0)
    1933                 :     {
    1934                 :         elog(DEBUG1, "initialized %d pages, up to %X/%X",
    1935                 :              npages, LSN_FORMAT_ARGS(NewPageEndPtr));
    1936                 :     }
    1937                 : #endif
    1938 GIC      749930 : }
    1939                 : 
    1940                 : /*
    1941 ECB             :  * Calculate CheckPointSegments based on max_wal_size_mb and
    1942                 :  * checkpoint_completion_target.
    1943                 :  */
    1944                 : static void
    1945 GIC        9091 : CalculateCheckpointSegments(void)
    1946                 : {
    1947                 :     double      target;
    1948 ECB             : 
    1949                 :     /*-------
    1950                 :      * Calculate the distance at which to trigger a checkpoint, to avoid
    1951                 :      * exceeding max_wal_size_mb. This is based on two assumptions:
    1952                 :      *
    1953                 :      * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
    1954                 :      *    WAL for two checkpoint cycles to allow us to recover from the
    1955                 :      *    secondary checkpoint if the first checkpoint failed, though we
    1956                 :      *    only did this on the primary anyway, not on standby. Keeping just
    1957                 :      *    one checkpoint simplifies processing and reduces disk space in
    1958                 :      *    many smaller databases.)
    1959                 :      * b) during checkpoint, we consume checkpoint_completion_target *
    1960                 :      *    number of segments consumed between checkpoints.
    1961                 :      *-------
    1962                 :      */
    1963 GIC        9091 :     target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
    1964            9091 :         (1.0 + CheckPointCompletionTarget);
    1965                 : 
    1966 ECB             :     /* round down */
    1967 CBC        9091 :     CheckPointSegments = (int) target;
    1968                 : 
    1969 GIC        9091 :     if (CheckPointSegments < 1)
    1970 CBC           9 :         CheckPointSegments = 1;
    1971 GIC        9091 : }
    1972 ECB             : 
    1973                 : void
    1974 CBC        6012 : assign_max_wal_size(int newval, void *extra)
    1975                 : {
    1976 GIC        6012 :     max_wal_size_mb = newval;
    1977 CBC        6012 :     CalculateCheckpointSegments();
    1978 GIC        6012 : }
    1979 ECB             : 
    1980                 : void
    1981 CBC        1857 : assign_checkpoint_completion_target(double newval, void *extra)
    1982                 : {
    1983 GIC        1857 :     CheckPointCompletionTarget = newval;
    1984 CBC        1857 :     CalculateCheckpointSegments();
    1985 GIC        1857 : }
    1986 ECB             : 
    1987                 : /*
    1988                 :  * At a checkpoint, how many WAL segments to recycle as preallocated future
    1989                 :  * XLOG segments? Given the redo pointer 'lastredoptr', returns the highest segment number that should be preallocated.
    1990                 :  */
    1991                 : static XLogSegNo
    1992 GIC        2363 : XLOGfileslop(XLogRecPtr lastredoptr)
    1993                 : {
    1994                 :     XLogSegNo   minSegNo;       /* segment number corresponding to min_wal_size_mb */
    1995 ECB             :     XLogSegNo   maxSegNo;       /* segment number corresponding to max_wal_size_mb */
    1996                 :     double      distance;       /* estimated bytes of WAL until the next checkpoint ends */
    1997                 :     XLogSegNo   recycleSegNo;   /* result */
    1998                 : 
    1999                 :     /*
    2000                 :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
    2001                 :      * correspond to. Always recycle enough segments to meet the minimum, and
    2002                 :      * remove enough segments to stay below the maximum.
    2003                 :      */
    2004 GIC        2363 :     minSegNo = lastredoptr / wal_segment_size +
    2005            2363 :         ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
    2006            2363 :     maxSegNo = lastredoptr / wal_segment_size +
    2007 CBC        2363 :         ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
    2008 ECB             : 
    2009                 :     /*
    2010                 :      * Between those limits, recycle enough segments to get us through to the
    2011                 :      * estimated end of next checkpoint.
    2012                 :      *
    2013                 :      * To estimate where the next checkpoint will finish, assume that the
    2014                 :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
    2015                 :      * every checkpoint.
    2016                 :      */
    2017 GIC        2363 :     distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
    2018                 :     /* add 10% for good measure. */
    2019            2363 :     distance *= 1.10;
    2020 ECB             : 
    2021 GIC        2363 :     recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
    2022 ECB             :                                     wal_segment_size);  /* round up to a whole segment */
    2023                 : 
    2024 CBC        2363 :     if (recycleSegNo < minSegNo)
    2025 GIC        2219 :         recycleSegNo = minSegNo;
    2026            2363 :     if (recycleSegNo > maxSegNo)    /* checked last, so the maximum wins if the two limits conflict */
    2027 CBC          55 :         recycleSegNo = maxSegNo;
    2028 ECB             : 
    2029 CBC        2363 :     return recycleSegNo;
    2030 ECB             : }
    2031                 : 
    2032                 : /*
    2033                 :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
    2034                 :  *
    2035                 :  * new_segno indicates a log file that has just been filled up (or read
    2036                 :  * during recovery). We measure the distance from RedoRecPtr to new_segno
    2037                 :  * and see if that exceeds CheckPointSegments.
    2038                 :  * Precisely: returns true iff new_segno >= (segment of RedoRecPtr) + CheckPointSegments - 1.
    2039                 :  * Note: it is the caller's responsibility to ensure that RedoRecPtr is up-to-date.
    2040                 :  */
    2041                 : bool
    2042 GIC         865 : XLogCheckpointNeeded(XLogSegNo new_segno)
    2043                 : {
    2044                 :     XLogSegNo   old_segno;      /* filled in from RedoRecPtr by XLByteToSeg below */
    2045 ECB             : 
    2046 GIC         865 :     XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
    2047                 : 
    2048             865 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
    2049 CBC         147 :         return true;
    2050 GIC         718 :     return false;
    2051 ECB             : }
    2052                 : 
    2053                 : /*
    2054                 :  * Write and/or fsync the log at least as far as WriteRqst indicates.
    2055                 :  *
    2056                 :  * If flexible == true, we don't have to write as far as WriteRqst, but
    2057                 :  * may stop at any convenient boundary (such as a cache or logfile boundary).
    2058                 :  * This option allows us to avoid uselessly issuing multiple writes when a
    2059                 :  * single one would do.
    2060                 :  *
    2061                 :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
    2062                 :  * must be called before grabbing the lock, to make sure the data is ready to
    2063                 :  * write.  Segment files are created/opened, fsync'ed and reported under timeline 'tli'.
    2064                 :  */
    2065                 : static void
    2066 GIC      731996 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
    2067                 : {
    2068                 :     bool        ispartialpage;  /* request ends before the end of the current page */
    2069 ECB             :     bool        last_iteration; /* request is covered once the current page is written */
    2070                 :     bool        finishing_seg;  /* pending pages reach the end of the segment file */
    2071                 :     int         curridx;
    2072                 :     int         npages;
    2073                 :     int         startidx;
    2074                 :     uint32      startoffset;
    2075                 : 
    2076                 :     /* We should always be inside a critical section here */
    2077 GIC      731996 :     Assert(CritSectionCount > 0);
    2078                 : 
    2079                 :     /*
    2080 ECB             :      * Update local LogwrtResult (caller probably did this already, but...)
    2081                 :      */
    2082 GIC      731996 :     LogwrtResult = XLogCtl->LogwrtResult;
    2083                 : 
    2084                 :     /*
    2085 ECB             :      * Since successive pages in the xlog cache are consecutively allocated,
    2086                 :      * we can usually gather multiple pages together and issue just one
    2087                 :      * write() call.  npages is the number of pages we have determined can be
    2088                 :      * written together; startidx is the cache block index of the first one,
    2089                 :      * and startoffset is the file offset at which it should go. The latter
    2090                 :      * two variables are only valid when npages > 0, but we must initialize
    2091                 :      * all of them to keep the compiler quiet.
    2092                 :      */
    2093 GIC      731996 :     npages = 0;
    2094          731996 :     startidx = 0;
    2095          731996 :     startoffset = 0;
    2096 ECB             : 
    2097                 :     /*
    2098                 :      * Within the loop, curridx is the cache block index of the page to
    2099                 :      * consider writing.  Begin at the buffer containing the next unwritten
    2100                 :      * page, or last partially written page.
    2101                 :      */
    2102 GIC      731996 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
    2103                 : 
    2104         1617944 :     while (LogwrtResult.Write < WriteRqst.Write)
    2105 ECB             :     {
    2106                 :         /*
    2107                 :          * Make sure we're not ahead of the insert process.  This could happen
    2108                 :          * if we're passed a bogus WriteRqst.Write that is past the end of the
    2109                 :          * last page that's been initialized by AdvanceXLInsertBuffer.
    2110                 :          */
    2111 GIC     1184781 :         XLogRecPtr  EndPtr = XLogCtl->xlblocks[curridx];
    2112                 : 
    2113         1184781 :         if (LogwrtResult.Write >= EndPtr)
    2114 LBC           0 :             elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
    2115                 :                  LSN_FORMAT_ARGS(LogwrtResult.Write),
    2116 ECB             :                  LSN_FORMAT_ARGS(EndPtr));
    2117 EUB             : 
    2118                 :         /* Advance LogwrtResult.Write to end of current buffer page */
    2119 GIC     1184781 :         LogwrtResult.Write = EndPtr;
    2120         1184781 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
    2121                 : 
    2122 CBC     1184781 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2123 ECB             :                              wal_segment_size))
    2124                 :         {
    2125                 :             /*
    2126                 :              * Switch to new logfile segment.  We cannot have any pending
    2127                 :              * pages here (since we dump what we have at segment end).
    2128                 :              */
    2129 GIC        7172 :             Assert(npages == 0);
    2130            7172 :             if (openLogFile >= 0)
    2131            1626 :                 XLogFileClose();
    2132 CBC        7172 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2133 ECB             :                             wal_segment_size);
    2134 CBC        7172 :             openLogTLI = tli;
    2135 ECB             : 
    2136                 :             /* create/use new log file */
    2137 CBC        7172 :             openLogFile = XLogFileInit(openLogSegNo, tli);
    2138 GIC        7172 :             ReserveExternalFD();
    2139                 :         }
    2140 ECB             : 
    2141                 :         /* Make sure we have the current logfile open */
    2142 GIC     1184781 :         if (openLogFile < 0)
    2143                 :         {
    2144 UIC           0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2145 ECB             :                             wal_segment_size);
    2146 UIC           0 :             openLogTLI = tli;
    2147 UBC           0 :             openLogFile = XLogFileOpen(openLogSegNo, tli);
    2148 UIC           0 :             ReserveExternalFD();
    2149 EUB             :         }
    2150                 : 
    2151                 :         /* Add current page to the set of pending pages-to-dump */
    2152 GIC     1184781 :         if (npages == 0)
    2153                 :         {
    2154                 :             /* first of group */
    2155 CBC      736563 :             startidx = curridx;
    2156 GIC      736563 :             startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
    2157                 :                                             wal_segment_size);
    2158 ECB             :         }
    2159 CBC     1184781 :         npages++;
    2160                 : 
    2161                 :         /*
    2162 ECB             :          * Dump the set if this will be the last loop iteration, or if we are
    2163                 :          * at the last page of the cache area (since the next page won't be
    2164                 :          * contiguous in memory), or if we are at the end of the logfile
    2165                 :          * segment.
    2166                 :          */
    2167 GIC     1184781 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
    2168                 : 
    2169         2075101 :         finishing_seg = !ispartialpage &&
    2170 CBC      890320 :             (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
    2171                 : 
    2172         1184781 :         if (last_iteration ||
    2173          453199 :             curridx == XLogCtl->XLogCacheBlck ||
    2174                 :             finishing_seg)
    2175 ECB             :         {
    2176                 :             char       *from;
    2177                 :             Size        nbytes;
    2178                 :             Size        nleft;
    2179                 :             int         written;
    2180                 :             instr_time  start;
    2181                 : 
    2182                 :             /* OK to write the page(s) */
    2183 GIC      736563 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    2184          736563 :             nbytes = npages * (Size) XLOG_BLCKSZ;
    2185          736563 :             nleft = nbytes;
    2186 ECB             :             do
    2187                 :             {
    2188 CBC      736563 :                 errno = 0;
    2189                 : 
    2190                 :                 /* Measure I/O timing to write WAL data */
    2191          736563 :                 if (track_wal_io_timing)
    2192 UIC           0 :                     INSTR_TIME_SET_CURRENT(start);
    2193                 :                 else
    2194 GNC      736563 :                     INSTR_TIME_SET_ZERO(start);
    2195                 : 
    2196 CBC      736563 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
    2197 GBC      736563 :                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
    2198 GIC      736563 :                 pgstat_report_wait_end();
    2199 ECB             : 
    2200                 :                 /*
    2201                 :                  * Increment the I/O timing and the number of times WAL data
    2202                 :                  * were written out to disk.
    2203                 :                  */
    2204 GIC      736563 :                 if (track_wal_io_timing)
    2205                 :                 {
    2206                 :                     instr_time  duration;
    2207                 : 
    2208 UIC           0 :                     INSTR_TIME_SET_CURRENT(duration);
    2209 UNC           0 :                     INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, duration, start);
    2210                 :                 }
    2211                 : 
    2212 GBC      736563 :                 PendingWalStats.wal_write++;    /* counts every pg_pwrite() attempt, since it precedes the result check */
    2213 EUB             : 
    2214 GIC      736563 :                 if (written <= 0)
    2215                 :                 {
    2216 ECB             :                     char        xlogfname[MAXFNAMELEN];
    2217                 :                     int         save_errno;
    2218                 : 
    2219 UIC           0 :                     if (errno == EINTR)
    2220               0 :                         continue;   /* retry: nleft is unchanged, so the loop condition still holds */
    2221                 : 
    2222               0 :                     save_errno = errno;     /* keep errno for %m across XLogFileName() */
    2223 UBC           0 :                     XLogFileName(xlogfname, tli, openLogSegNo,
    2224 EUB             :                                  wal_segment_size);
    2225 UIC           0 :                     errno = save_errno;
    2226 UBC           0 :                     ereport(PANIC,
    2227 EUB             :                             (errcode_for_file_access(),
    2228                 :                              errmsg("could not write to log file %s "
    2229                 :                                     "at offset %u, length %zu: %m",
    2230                 :                                     xlogfname, startoffset, nleft)));
    2231                 :                 }
    2232 GIC      736563 :                 nleft -= written;       /* a short write just loops again for the remainder */
    2233          736563 :                 from += written;
    2234          736563 :                 startoffset += written;
    2235          736563 :             } while (nleft > 0);
    2236 ECB             : 
    2237 CBC      736563 :             npages = 0;     /* the pending set has been written out */
    2238 ECB             : 
    2239                 :             /*
    2240                 :              * If we just wrote the whole last page of a logfile segment,
    2241                 :              * fsync the segment immediately.  This avoids having to go back
    2242                 :              * and re-open prior segments when an fsync request comes along
    2243                 :              * later. Doing it here ensures that one and only one backend will
    2244                 :              * perform this fsync.
    2245                 :              *
    2246                 :              * This is also the right place to notify the Archiver that the
    2247                 :              * segment is ready to copy to archival storage, and to update the
    2248                 :              * timer for archive_timeout, and to signal for a checkpoint if
    2249                 :              * too many logfile segments have been used since the last
    2250                 :              * checkpoint.
    2251                 :              */
    2252 GIC      736563 :             if (finishing_seg)
    2253                 :             {
    2254             692 :                 issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2255                 : 
    2256 ECB             :                 /* signal that we need to wakeup walsenders later */
    2257 GIC         692 :                 WalSndWakeupRequest();
    2258 ECB             : 
    2259 GIC         692 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
    2260                 : 
    2261 CBC         692 :                 if (XLogArchivingActive())
    2262 GIC          33 :                     XLogArchiveNotifySeg(openLogSegNo, tli);
    2263 ECB             : 
    2264 GIC         692 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    2265 CBC         692 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
    2266 ECB             : 
    2267                 :                 /*
    2268                 :                  * Request a checkpoint if we've consumed too much xlog since
    2269                 :                  * the last one.  For speed, we first check using the local
    2270                 :                  * copy of RedoRecPtr, which might be out of date; if it looks
    2271                 :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
    2272                 :                  * recheck.
    2273                 :                  */
    2274 GIC         692 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
    2275                 :                 {
    2276              40 :                     (void) GetRedoRecPtr();
    2277              40 :                     if (XLogCheckpointNeeded(openLogSegNo))
    2278 CBC          30 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
    2279                 :                 }
    2280 ECB             :             }
    2281                 :         }
    2282                 : 
    2283 GIC     1184781 :         if (ispartialpage)
    2284                 :         {
    2285                 :             /* Only asked to write a partial page */
    2286          294461 :             LogwrtResult.Write = WriteRqst.Write;
    2287 CBC      294461 :             break;
    2288                 :         }
    2289 GIC      890320 :         curridx = NextBufIdx(curridx);
    2290 ECB             : 
    2291                 :         /* If flexible, break out of loop as soon as we wrote something */
    2292 GIC      890320 :         if (flexible && npages == 0)
    2293 CBC        4372 :             break;
    2294                 :     }
    2295                 : 
    2296          731996 :     Assert(npages == 0);
    2297 ECB             : 
    2298                 :     /*
    2299                 :      * If asked to flush, do so
    2300                 :      */
    2301 GIC      731996 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
    2302          296291 :         LogwrtResult.Flush < LogwrtResult.Write)
    2303                 :     {
    2304                 :         /*
    2305 ECB             :          * Could get here without iterating above loop, in which case we might
    2306                 :          * have no open file or the wrong one.  However, we do not need to
    2307                 :          * fsync more than one file.
    2308                 :          */
    2309 GIC      296260 :         if (sync_method != SYNC_METHOD_OPEN &&
    2310          296260 :             sync_method != SYNC_METHOD_OPEN_DSYNC)
    2311                 :         {
    2312          296260 :             if (openLogFile >= 0 &&
    2313 CBC      296253 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2314 ECB             :                                  wal_segment_size))
    2315 UIC           0 :                 XLogFileClose();
    2316 CBC      296260 :             if (openLogFile < 0)
    2317 ECB             :             {
    2318 GIC           7 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
    2319 EUB             :                                 wal_segment_size);
    2320 CBC           7 :                 openLogTLI = tli;
    2321 GIC           7 :                 openLogFile = XLogFileOpen(openLogSegNo, tli);
    2322 CBC           7 :                 ReserveExternalFD();
    2323                 :             }
    2324 ECB             : 
    2325 CBC      296260 :             issue_xlog_fsync(openLogFile, openLogSegNo, tli);
    2326 ECB             :         }
    2327                 : 
    2328                 :         /* signal that we need to wakeup walsenders later */
    2329 CBC      296260 :         WalSndWakeupRequest();
    2330                 : 
    2331 GIC      296260 :         LogwrtResult.Flush = LogwrtResult.Write;
    2332                 :     }
    2333 ECB             : 
    2334                 :     /*
    2335                 :      * Update shared-memory status
    2336                 :      *
    2337                 :      * We make sure that the shared 'request' values do not fall behind the
    2338                 :      * 'result' values.  This is not absolutely essential, but it saves some
    2339                 :      * code in a couple of places.
    2340                 :      */
    2341                 :     {
    2342 GIC      731996 :         SpinLockAcquire(&XLogCtl->info_lck);
    2343          731996 :         XLogCtl->LogwrtResult = LogwrtResult;
    2344          731996 :         if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
    2345          285674 :             XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    2346 CBC      731996 :         if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
    2347          296653 :             XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    2348          731996 :         SpinLockRelease(&XLogCtl->info_lck);
    2349 ECB             :     }
    2350 CBC      731996 : }
    2351 ECB             : 
    2352                 : /*
    2353                 :  * Record the LSN for an asynchronous transaction commit/abort
    2354                 :  * and nudge the WALWriter if there is work for it to do.
    2355                 :  * (This should not be called for synchronous commits.)
    2356                 :  */
    2357                 : void
    2358 GIC       60542 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
    2359                 : {
    2360           60542 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
    2361                 :     bool        sleeping;       /* snapshot of XLogCtl->WalWriterSleeping */
    2362 ECB             : 
    2363 GIC       60542 :     SpinLockAcquire(&XLogCtl->info_lck);
    2364 CBC       60542 :     LogwrtResult = XLogCtl->LogwrtResult;
    2365 GIC       60542 :     sleeping = XLogCtl->WalWriterSleeping;
    2366           60542 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)   /* the shared value only ever moves forward */
    2367 CBC       60214 :         XLogCtl->asyncXactLSN = asyncXactLSN;
    2368           60542 :     SpinLockRelease(&XLogCtl->info_lck);
    2369 ECB             : 
    2370                 :     /*
    2371                 :      * If the WALWriter is sleeping, we should kick it to make it come out of
    2372                 :      * low-power mode.  Otherwise, determine whether there's a full page of
    2373                 :      * WAL available to write.
    2374                 :      */
    2375 GIC       60542 :     if (!sleeping)
    2376                 :     {
    2377                 :         /* back off to last completed page boundary */
    2378           60508 :         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
    2379 ECB             : 
    2380                 :         /* if we have already flushed that far, we're done */
    2381 GIC       60508 :         if (WriteRqstPtr <= LogwrtResult.Flush)
    2382 CBC       17598 :             return;
    2383                 :     }
    2384                 : 
    2385 ECB             :     /*
    2386                 :      * Nudge the WALWriter: it has a full page of WAL to write, or we want it
    2387                 :      * to come out of low-power mode so that this async commit will reach disk
    2388                 :      * within the expected amount of time.
    2389                 :      */
    2390 GIC       42944 :     if (ProcGlobal->walwriterLatch)     /* nothing to signal when no latch is registered */
    2391            8097 :         SetLatch(ProcGlobal->walwriterLatch);
    2392                 : }
    2393                 : 
    2394 ECB             : /*
    2395                 :  * Record the LSN up to which we can remove WAL because it's not required by
    2396                 :  * any replication slot.  The value is stored unconditionally in XLogCtl->replicationSlotMinLSN.
    2397                 :  */
    2398                 : void
    2399 GIC       19623 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
    2400                 : {
    2401           19623 :     SpinLockAcquire(&XLogCtl->info_lck);    /* the shared field is written under info_lck */
    2402           19623 :     XLogCtl->replicationSlotMinLSN = lsn;
    2403 CBC       19623 :     SpinLockRelease(&XLogCtl->info_lck);
    2404 GIC       19623 : }
    2405 ECB             : 
    2406                 : 
    2407                 : /*
    2408                 :  * Return the oldest LSN we must retain to satisfy the needs of some
    2409                 :  * replication slot, i.e. the current XLogCtl->replicationSlotMinLSN.
    2410                 :  */
    2411                 : static XLogRecPtr
    2412 GIC        2660 : XLogGetReplicationSlotMinimumLSN(void)
    2413                 : {
    2414                 :     XLogRecPtr  retval;
    2415                 : 
    2416 CBC        2660 :     SpinLockAcquire(&XLogCtl->info_lck);    /* the shared field is read under info_lck */
    2417 GIC        2660 :     retval = XLogCtl->replicationSlotMinLSN;
    2418            2660 :     SpinLockRelease(&XLogCtl->info_lck);
    2419                 : 
    2420 CBC        2660 :     return retval;
    2421 ECB             : }
    2422                 : 
    2423                 : /*
    2424                 :  * Advance minRecoveryPoint in control file.
    2425                 :  *
    2426                 :  * If we crash during recovery, we must reach this point again before the
    2427                 :  * database is consistent.
    2428                 :  *
    2429                 :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
    2430                 :  * is only updated if it's not already greater than or equal to 'lsn'.
    2431                 :  */
    2432                 : static void
    2433 GIC       87149 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
    2434                 : {
    2435                 :     /* Quick check using our local copy of the variable */
    2436           87149 :     if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
    2437 CBC       80958 :         return;     /* updates are switched off, or the local copy already covers 'lsn' */
    2438                 : 
    2439                 :     /*
    2440 ECB             :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
    2441                 :      * i.e., we're doing crash recovery.  We never modify the control file's
    2442                 :      * value in that case, so we can short-circuit future checks here too. The
    2443                 :      * local values of minRecoveryPoint and minRecoveryPointTLI should not be
    2444                 :      * updated until crash recovery finishes.  We only do this for the startup
    2445                 :      * process as it should not update its own reference of minRecoveryPoint
    2446                 :      * until it has finished crash recovery to make sure that all WAL
    2447                 :      * available is replayed in this case.  This also saves the startup process
    2448                 :      * from taking extra locks on the control file.
    2449                 :      */
    2450 GIC        6191 :     if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
    2451                 :     {
    2452              25 :         updateMinRecoveryPoint = false;
    2453              25 :         return;
    2454 ECB             :     }
    2455                 : 
    2456 CBC        6166 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    2457 ECB             : 
    2458                 :     /* update local copy */
    2459 GIC        6166 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    2460 CBC        6166 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2461                 : 
    2462 GIC        6166 :     if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
    2463 CBC           1 :         updateMinRecoveryPoint = false;     /* control file value is invalid: stop attempting updates */
    2464            6165 :     else if (force || LocalMinRecoveryPoint < lsn)
    2465                 :     {
    2466 ECB             :         XLogRecPtr  newMinRecoveryPoint;
    2467                 :         TimeLineID  newMinRecoveryPointTLI;
    2468                 : 
    2469                 :         /*
    2470                 :          * To avoid having to update the control file too often, we update it
    2471                 :          * all the way to the last record being replayed, even though 'lsn'
    2472                 :          * would suffice for correctness.  This also allows the 'force' case
    2473                 :          * to not need a valid 'lsn' value.
    2474                 :          *
    2475                 :          * Another important reason for doing it this way is that the passed
    2476                 :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
    2477                 :          * the caller got it from a corrupted heap page.  Accepting such a
    2478                 :          * value as the min recovery point would prevent us from coming up at
    2479                 :          * all.  Instead, we just log a warning and continue with recovery.
    2480                 :          * (See also the comments about corrupt LSNs in XLogFlush.)
    2481                 :          */
    2482 GIC        5869 :         newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
    2483            5869 :         if (!force && newMinRecoveryPoint < lsn)
    2484 UIC           0 :             elog(WARNING,
    2485                 :                  "xlog min recovery request %X/%X is past current point %X/%X",
    2486 ECB             :                  LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
    2487                 : 
    2488 EUB             :         /* update control file, but never move the stored value backwards */
    2489 GIC        5869 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
    2490                 :         {
    2491            5849 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
    2492            5849 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
    2493 CBC        5849 :             UpdateControlFile();
    2494 GIC        5849 :             LocalMinRecoveryPoint = newMinRecoveryPoint;
    2495 CBC        5849 :             LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
    2496 ECB             : 
    2497 CBC        5849 :             ereport(DEBUG2,
    2498 ECB             :                     (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
    2499                 :                                      LSN_FORMAT_ARGS(newMinRecoveryPoint),
    2500                 :                                      newMinRecoveryPointTLI)));
    2501                 :         }
    2502                 :     }
    2503 GIC        6166 :     LWLockRelease(ControlFileLock);
    2504                 : }
    2505                 : 
    2506                 : /*
    2507 ECB             :  * Ensure that all XLOG data through the given position is flushed to disk.
    2508                 :  *
    2509                 :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
    2510                 :  * already held, and we try to avoid acquiring it if possible.
    2511                 :  */
    2512                 : void
    2513 GIC     1019619 : XLogFlush(XLogRecPtr record)
    2514                 : {
    2515                 :     XLogRecPtr  WriteRqstPtr;
    2516                 :     XLogwrtRqst WriteRqst;
    2517 CBC     1019619 :     TimeLineID  insertTLI = XLogCtl->InsertTimeLineID;
    2518                 : 
    2519                 :     /*
    2520                 :      * During REDO, we are reading not writing WAL.  Therefore, instead of
    2521 ECB             :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
    2522                 :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
    2523                 :      * to act this way too, and because when it tries to write the
    2524                 :      * end-of-recovery checkpoint, it should indeed flush.
    2525                 :      */
    2526 GIC     1019619 :     if (!XLogInsertAllowed())
    2527                 :     {
    2528           87067 :         UpdateMinRecoveryPoint(record, false);
    2529          714330 :         return;
    2530 ECB             :     }
    2531                 : 
    2532                 :     /* Quick exit if already known flushed */
    2533 CBC      932552 :     if (record <= LogwrtResult.Flush)
    2534 GIC      627263 :         return;
    2535                 : 
    2536                 : #ifdef WAL_DEBUG
    2537 ECB             :     if (XLOG_DEBUG)
    2538                 :         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
    2539                 :              LSN_FORMAT_ARGS(record),
    2540                 :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    2541                 :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2542                 : #endif
    2543                 : 
    2544 GIC      305289 :     START_CRIT_SECTION();
    2545                 : 
    2546                 :     /*
    2547                 :      * Since fsync is usually a horribly expensive operation, we try to
    2548 ECB             :      * piggyback as much data as we can on each fsync: if we see any more data
    2549                 :      * entered into the xlog buffer, we'll write and fsync that too, so that
    2550                 :      * the final value of LogwrtResult.Flush is as large as possible. This
    2551                 :      * gives us some chance of avoiding another fsync immediately after.
    2552                 :      */
    2553                 : 
    2554                 :     /* initialize to given target; may increase below */
    2555 GIC      305289 :     WriteRqstPtr = record;
    2556                 : 
    2557                 :     /*
    2558                 :      * Now wait until we get the write lock, or someone else does the flush
    2559 ECB             :      * for us.
    2560                 :      */
    2561                 :     for (;;)
    2562 GIC         737 :     {
    2563                 :         XLogRecPtr  insertpos;
    2564                 : 
    2565                 :         /* read LogwrtResult and update local state */
    2566 CBC      306026 :         SpinLockAcquire(&XLogCtl->info_lck);
    2567 GIC      306026 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
    2568           13438 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
    2569          306026 :         LogwrtResult = XLogCtl->LogwrtResult;
    2570 CBC      306026 :         SpinLockRelease(&XLogCtl->info_lck);
    2571 ECB             : 
    2572                 :         /* done already? */
    2573 CBC      306026 :         if (record <= LogwrtResult.Flush)
    2574            9956 :             break;
    2575                 : 
    2576                 :         /*
    2577 ECB             :          * Before actually performing the write, wait for all in-flight
    2578                 :          * insertions to the pages we're about to write to finish.
    2579                 :          */
    2580 GIC      296070 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
    2581                 : 
    2582                 :         /*
    2583                 :          * Try to get the write lock. If we can't get it immediately, wait
    2584 ECB             :          * until it's released, and recheck if we still need to do the flush
    2585                 :          * or if the backend that held the lock did it for us already. This
    2586                 :          * helps to maintain a good rate of group committing when the system
    2587                 :          * is bottlenecked by the speed of fsyncing.
    2588                 :          */
    2589 GIC      296070 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
    2590                 :         {
    2591                 :             /*
    2592                 :              * The lock is now free, but we didn't acquire it yet. Before we
    2593 ECB             :              * do, loop back to check if someone else flushed the record for
    2594                 :              * us already.
    2595                 :              */
    2596 GIC         737 :             continue;
    2597                 :         }
    2598                 : 
    2599                 :         /* Got the lock; recheck whether request is satisfied */
    2600 CBC      295333 :         LogwrtResult = XLogCtl->LogwrtResult;
    2601 GIC      295333 :         if (record <= LogwrtResult.Flush)
    2602                 :         {
    2603             642 :             LWLockRelease(WALWriteLock);
    2604 CBC         642 :             break;
    2605 ECB             :         }
    2606                 : 
    2607                 :         /*
    2608                 :          * Sleep before flush! By adding a delay here, we may give further
    2609                 :          * backends the opportunity to join the backlog of group commit
    2610                 :          * followers; this can significantly improve transaction throughput,
    2611                 :          * at the risk of increasing transaction latency.
    2612                 :          *
    2613                 :          * We do not sleep if enableFsync is not turned on, nor if there are
    2614                 :          * fewer than CommitSiblings other backends with active transactions.
    2615                 :          */
    2616 GIC      294691 :         if (CommitDelay > 0 && enableFsync &&
    2617 UIC           0 :             MinimumActiveBackends(CommitSiblings))
    2618                 :         {
    2619               0 :             pg_usleep(CommitDelay);
    2620 ECB             : 
    2621 EUB             :             /*
    2622                 :              * Re-check how far we can now flush the WAL. It's generally not
    2623                 :              * safe to call WaitXLogInsertionsToFinish while holding
    2624                 :              * WALWriteLock, because an in-progress insertion might need to
    2625                 :              * also grab WALWriteLock to make progress. But we know that all
    2626                 :              * the insertions up to insertpos have already finished, because
    2627                 :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
    2628                 :              * We're only calling it again to allow insertpos to be moved
    2629                 :              * further forward, not to actually wait for anyone.
    2630                 :              */
    2631 UIC           0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
    2632                 :         }
    2633                 : 
    2634                 :         /* try to write/flush later additions to XLOG as well */
    2635 GBC      294691 :         WriteRqst.Write = insertpos;
    2636 GIC      294691 :         WriteRqst.Flush = insertpos;
    2637                 : 
    2638          294691 :         XLogWrite(WriteRqst, insertTLI, false);
    2639 ECB             : 
    2640 CBC      294691 :         LWLockRelease(WALWriteLock);
    2641                 :         /* done */
    2642          294691 :         break;
    2643                 :     }
    2644 ECB             : 
    2645 GIC      305289 :     END_CRIT_SECTION();
    2646 ECB             : 
    2647                 :     /* wake up walsenders now that we've released heavily contended locks */
    2648 GNC      305289 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    2649 ECB             : 
    2650                 :     /*
    2651                 :      * If we still haven't flushed to the request point then we have a
    2652                 :      * problem; most likely, the requested flush point is past end of XLOG.
    2653                 :      * This has been seen to occur when a disk page has a corrupted LSN.
    2654                 :      *
    2655                 :      * Formerly we treated this as a PANIC condition, but that hurts the
    2656                 :      * system's robustness rather than helping it: we do not want to take down
    2657                 :      * the whole system due to corruption on one data page.  In particular, if
    2658                 :      * the bad page is encountered again during recovery then we would be
    2659                 :      * unable to restart the database at all!  (This scenario actually
    2660                 :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
    2661                 :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
    2662                 :      * the only time we can reach here during recovery is while flushing the
    2663                 :      * end-of-recovery checkpoint record, and we don't expect that to have a
    2664                 :      * bad LSN.
    2665                 :      *
    2666                 :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
    2667                 :      * since xact.c calls this routine inside a critical section.  However,
    2668                 :      * calls from bufmgr.c are not within critical sections and so we will not
    2669                 :      * force a restart for a bad LSN on a data page.
    2670                 :      */
    2671 GIC      305289 :     if (LogwrtResult.Flush < record)
    2672 UIC           0 :         elog(ERROR,
    2673                 :              "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
    2674                 :              LSN_FORMAT_ARGS(record),
    2675 ECB             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2676 EUB             : }
    2677                 : 
    2678                 : /*
    2679                 :  * Write & flush xlog, but without specifying exactly where to.
    2680                 :  *
    2681                 :  * We normally write only completed blocks; but if there is nothing to do on
    2682                 :  * that basis, we check for unwritten async commits in the current incomplete
    2683                 :  * block, and write through the latest one of those.  Thus, if async commits
    2684                 :  * are not being used, we will write complete blocks only.
    2685                 :  *
    2686                 :  * If, based on the above, there's anything to write we do so immediately. But
    2687                 :  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
    2688                 :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
    2689                 :  * more than wal_writer_flush_after unflushed blocks.
    2690                 :  *
    2691                 :  * We can guarantee that async commits reach disk after at most three
    2692                 :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
    2693                 :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
    2694                 :  * this makes a difference only with very high load or long wal_writer_delay,
    2695                 :  * but imposes one extra cycle for the worst case for async commits.)
    2696                 :  *
    2697                 :  * This routine is invoked periodically by the background walwriter process.
    2698                 :  *
    2699                 :  * Returns true if there was any work to do, even if we skipped flushing due
    2700                 :  * to wal_writer_delay/wal_writer_flush_after.
    2701                 :  */
    2702                 : bool
    2703 GIC       14942 : XLogBackgroundFlush(void)
    2704                 : {
    2705                 :     XLogwrtRqst WriteRqst;
    2706           14942 :     bool        flexible = true;
    2707 ECB             :     static TimestampTz lastflush;
    2708                 :     TimestampTz now;
    2709                 :     int         flushbytes;
    2710                 :     TimeLineID  insertTLI;
    2711                 : 
    2712                 :     /* XLOG doesn't need flushing during recovery */
    2713 GIC       14942 :     if (RecoveryInProgress())
    2714               8 :         return false;
    2715                 : 
    2716                 :     /*
    2717 ECB             :      * Since we're not in recovery, InsertTimeLineID is set and can't change,
    2718                 :      * so we can read it without a lock.
    2719                 :      */
    2720 GIC       14934 :     insertTLI = XLogCtl->InsertTimeLineID;
    2721                 : 
    2722                 :     /* read LogwrtResult and update local state */
    2723           14934 :     SpinLockAcquire(&XLogCtl->info_lck);
    2724 CBC       14934 :     LogwrtResult = XLogCtl->LogwrtResult;
    2725 GIC       14934 :     WriteRqst = XLogCtl->LogwrtRqst;
    2726           14934 :     SpinLockRelease(&XLogCtl->info_lck);
    2727 ECB             : 
    2728                 :     /* back off to last completed page boundary */
    2729 CBC       14934 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
    2730 ECB             : 
    2731                 :     /* if we have already flushed that far, consider async commit records */
    2732 GIC       14934 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    2733 ECB             :     {
    2734 GIC        8201 :         SpinLockAcquire(&XLogCtl->info_lck);
    2735            8201 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
    2736 CBC        8201 :         SpinLockRelease(&XLogCtl->info_lck);
    2737 GIC        8201 :         flexible = false;       /* ensure it all gets written */
    2738 ECB             :     }
    2739                 : 
    2740                 :     /*
    2741                 :      * If already known flushed, we're done. Just need to check if we are
    2742                 :      * holding an open file handle to a logfile that's no longer in use,
    2743                 :      * preventing the file from being deleted.
    2744                 :      */
    2745 GIC       14934 :     if (WriteRqst.Write <= LogwrtResult.Flush)
    2746                 :     {
    2747            7725 :         if (openLogFile >= 0)
    2748                 :         {
    2749 CBC        4361 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
    2750                 :                                  wal_segment_size))
    2751 ECB             :             {
    2752 GIC          79 :                 XLogFileClose();
    2753 ECB             :             }
    2754                 :         }
    2755 GIC        7725 :         return false;
    2756 ECB             :     }
    2757                 : 
    2758                 :     /*
    2759                 :      * Determine how far to flush WAL, based on the wal_writer_delay and
    2760                 :      * wal_writer_flush_after GUCs.
    2761                 :      */
    2762 GIC        7209 :     now = GetCurrentTimestamp();
    2763            7209 :     flushbytes =
    2764            7209 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
    2765                 : 
    2766 CBC        7209 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
    2767 ECB             :     {
    2768                 :         /* first call, or block based limits disabled */
    2769 GIC         175 :         WriteRqst.Flush = WriteRqst.Write;
    2770 CBC         175 :         lastflush = now;
    2771                 :     }
    2772 GIC        7034 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
    2773 ECB             :     {
    2774                 :         /*
    2775                 :          * Flush the writes at least every WalWriterDelay ms. This is
    2776                 :          * important to bound the amount of time it takes for an asynchronous
    2777                 :          * commit to hit disk.
    2778                 :          */
    2779 GIC        1705 :         WriteRqst.Flush = WriteRqst.Write;
    2780            1705 :         lastflush = now;
    2781                 :     }
    2782            5329 :     else if (flushbytes >= WalWriterFlushAfter)
    2783 ECB             :     {
    2784                 :         /* exceeded wal_writer_flush_after blocks, flush */
    2785 GIC           3 :         WriteRqst.Flush = WriteRqst.Write;
    2786 CBC           3 :         lastflush = now;
    2787                 :     }
    2788                 :     else
    2789 ECB             :     {
    2790                 :         /* no flushing, this time round */
    2791 GIC        5326 :         WriteRqst.Flush = 0;
    2792                 :     }
    2793                 : 
    2794                 : #ifdef WAL_DEBUG
    2795 ECB             :     if (XLOG_DEBUG)
    2796                 :         elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
    2797                 :              LSN_FORMAT_ARGS(WriteRqst.Write),
    2798                 :              LSN_FORMAT_ARGS(WriteRqst.Flush),
    2799                 :              LSN_FORMAT_ARGS(LogwrtResult.Write),
    2800                 :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
    2801                 : #endif
    2802                 : 
    2803 GIC        7209 :     START_CRIT_SECTION();
    2804                 : 
    2805                 :     /* now wait for any in-progress insertions to finish and get write lock */
    2806            7209 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
    2807 CBC        7209 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    2808 GIC        7209 :     LogwrtResult = XLogCtl->LogwrtResult;
    2809            7209 :     if (WriteRqst.Write > LogwrtResult.Write ||
    2810 CBC        2366 :         WriteRqst.Flush > LogwrtResult.Flush)
    2811 ECB             :     {
    2812 CBC        4879 :         XLogWrite(WriteRqst, insertTLI, flexible);
    2813 ECB             :     }
    2814 CBC        7209 :     LWLockRelease(WALWriteLock);
    2815                 : 
    2816            7209 :     END_CRIT_SECTION();
    2817                 : 
    2818 ECB             :     /* wake up walsenders now that we've released heavily contended locks */
    2819 GNC        7209 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
    2820 ECB             : 
    2821                 :     /*
    2822                 :      * Great, done. To take some work off the critical path, try to initialize
    2823                 :      * as many of the no-longer-needed WAL buffers for future use as we can.
    2824                 :      */
    2825 GIC        7209 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
    2826                 : 
    2827                 :     /*
    2828                 :      * If we determined that we need to write data, but somebody else
    2829 ECB             :      * wrote/flushed already, it should be considered as being active, to
    2830                 :      * avoid hibernating too early.
    2831                 :      */
    2832 GIC        7209 :     return true;
    2833                 : }
    2834                 : 
    2835                 : /*
    2836 ECB             :  * Test whether XLOG data has been flushed up to (at least) the given position.
    2837                 :  *
    2838                 :  * Returns true if a flush is still needed.  (It may be that someone else
    2839                 :  * is already in process of flushing that far, however.)
    2840                 :  */
    2841                 : bool
    2842 GIC    14946151 : XLogNeedsFlush(XLogRecPtr record)
    2843                 : {
    2844                 :     /*
    2845                 :      * During recovery, we don't flush WAL but update minRecoveryPoint
    2846 ECB             :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
    2847                 :      * would need to be updated.
    2848                 :      */
    2849 GIC    14946151 :     if (RecoveryInProgress())
    2850                 :     {
    2851                 :         /*
    2852                 :          * An invalid minRecoveryPoint means that we need to recover all the
    2853 ECB             :          * WAL, i.e., we're doing crash recovery.  We never modify the control
    2854                 :          * file's value in that case, so we can short-circuit future checks
    2855                 :          * here too.  This triggers a quick exit path for the startup process,
    2856                 :          * which cannot update its local copy of minRecoveryPoint as long as
    2857                 :          * it has not replayed all WAL available when doing crash recovery.
    2858                 :          */
    2859 GIC     1407027 :         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
    2860 UIC           0 :             updateMinRecoveryPoint = false;
    2861                 : 
    2862                 :         /* Quick exit if already known to be updated or cannot be updated */
    2863 CBC     1407027 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
    2864 GBC     1392862 :             return false;
    2865                 : 
    2866                 :         /*
    2867 ECB             :          * Update local copy of minRecoveryPoint. But if the lock is busy,
    2868                 :          * just return a conservative guess.
    2869                 :          */
    2870 GIC       14165 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
    2871 UIC           0 :             return true;
    2872 GIC       14165 :         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    2873           14165 :         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    2874 CBC       14165 :         LWLockRelease(ControlFileLock);
    2875 EUB             : 
    2876 ECB             :         /*
    2877                 :          * Check minRecoveryPoint for any other process than the startup
    2878                 :          * process doing crash recovery, which should not update the control
    2879                 :          * file value if crash recovery is still running.
    2880                 :          */
    2881 GIC       14165 :         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
    2882 UIC           0 :             updateMinRecoveryPoint = false;
    2883                 : 
    2884                 :         /* check again */
    2885 CBC       14165 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
    2886 GBC          77 :             return false;
    2887                 :         else
    2888 GIC       14088 :             return true;
    2889 ECB             :     }
    2890                 : 
    2891                 :     /* Quick exit if already known flushed */
    2892 CBC    13539124 :     if (record <= LogwrtResult.Flush)
    2893 GIC    13431597 :         return false;
    2894                 : 
    2895                 :     /* read LogwrtResult and update local state */
    2896 CBC      107527 :     SpinLockAcquire(&XLogCtl->info_lck);
    2897          107527 :     LogwrtResult = XLogCtl->LogwrtResult;
    2898 GIC      107527 :     SpinLockRelease(&XLogCtl->info_lck);
    2899                 : 
    2900 ECB             :     /* check again */
    2901 CBC      107527 :     if (record <= LogwrtResult.Flush)
    2902            2195 :         return false;
    2903                 : 
    2904 GIC      105332 :     return true;
    2905 ECB             : }
    2906                 : 
    2907                 : /*
    2908                 :  * Try to make a given XLOG file segment exist.
    2909                 :  *
    2910                 :  * logsegno: identify segment.
    2911                 :  *
    2912                 :  * *added: on return, true if this call raised the number of extant segments.
    2913                 :  *
    2914                 :  * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
    2915                 :  *
    2916                 :  * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
    2917                 :  * wanting an open segment should attempt to open "path", which usually will
    2918                 :  * succeed.  (This is weird, but it's efficient for the callers.)
    2919                 :  */
    2920                 : static int
    2921 GIC        7666 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
    2922                 :                      bool *added, char *path)
    2923                 : {
    2924                 :     char        tmppath[MAXPGPATH];
    2925                 :     XLogSegNo   installed_segno;
    2926                 :     XLogSegNo   max_segno;
    2927                 :     int         fd;
    2928                 :     int         save_errno;
    2929 GNC        7666 :     int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
    2930                 : 
    2931 GIC        7666 :     Assert(logtli != 0);
    2932                 : 
    2933 CBC        7666 :     XLogFilePath(path, logtli, logsegno, wal_segment_size);
    2934                 : 
    2935 ECB             :     /*
    2936                 :      * Try to use existent file (checkpoint maker may have created it already)
    2937                 :      */
    2938 GIC        7666 :     *added = false;
    2939 GNC        7666 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    2940            7666 :                        get_sync_bit(sync_method));
    2941 GIC        7666 :     if (fd < 0)
    2942                 :     {
    2943 CBC         684 :         if (errno != ENOENT)
    2944 LBC           0 :             ereport(ERROR,
    2945 ECB             :                     (errcode_for_file_access(),
    2946                 :                      errmsg("could not open file \"%s\": %m", path)));
    2947                 :     }
    2948                 :     else
    2949 GBC        6982 :         return fd;
    2950                 : 
    2951                 :     /*
    2952                 :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
    2953                 :      * another process is doing the same thing.  If so, we will end up
    2954 ECB             :      * pre-creating an extra log segment.  That seems OK, and better than
    2955                 :      * holding the lock throughout this lengthy process.
    2956                 :      */
    2957 GIC         684 :     elog(DEBUG2, "creating and filling new WAL file");
    2958                 : 
    2959             684 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    2960                 : 
    2961             684 :     unlink(tmppath);
    2962 ECB             : 
    2963 GNC         684 :     if (io_direct_flags & IO_DIRECT_WAL_INIT)
    2964 UNC           0 :         open_flags |= PG_O_DIRECT;
    2965                 : 
    2966                 :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    2967 GNC         684 :     fd = BasicOpenFile(tmppath, open_flags);
    2968 GIC         684 :     if (fd < 0)
    2969 LBC           0 :         ereport(ERROR,
    2970                 :                 (errcode_for_file_access(),
    2971 ECB             :                  errmsg("could not create file \"%s\": %m", tmppath)));
    2972 EUB             : 
    2973 CBC         684 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    2974             684 :     save_errno = 0;
    2975 GBC         684 :     if (wal_init_zero)
    2976                 :     {
    2977                 :         ssize_t     rc;
    2978 ECB             : 
    2979                 :         /*
    2980                 :          * Zero-fill the file.  With this setting, we do this the hard way to
    2981                 :          * ensure that all the file space has really been allocated.  On
    2982                 :          * platforms that allow "holes" in files, just seeking to the end
    2983                 :          * doesn't allocate intermediate space.  This way, we know that we
    2984                 :          * have all the space and (after the fsync below) that all the
    2985                 :          * indirect blocks are down on disk.  Therefore, fdatasync(2) or
    2986                 :          * O_DSYNC will be sufficient to sync future writes to the log file.
    2987                 :          */
    2988 GNC         684 :         rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
    2989                 : 
    2990             684 :         if (rc < 0)
    2991 UNC           0 :             save_errno = errno;
    2992 ECB             :     }
    2993                 :     else
    2994                 :     {
    2995                 :         /*
    2996                 :          * Otherwise, seeking to the end and writing a solitary byte is
    2997                 :          * enough.
    2998                 :          */
    2999 UBC           0 :         errno = 0;
    3000 UNC           0 :         if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
    3001 EUB             :         {
    3002                 :             /* if write didn't set errno, assume no disk space */
    3003 UBC           0 :             save_errno = errno ? errno : ENOSPC;
    3004                 :         }
    3005 EUB             :     }
    3006 GIC         684 :     pgstat_report_wait_end();
    3007                 : 
    3008             684 :     if (save_errno)
    3009                 :     {
    3010 ECB             :         /*
    3011                 :          * If we fail to make the file, delete it to release disk space
    3012                 :          */
    3013 UBC           0 :         unlink(tmppath);
    3014 EUB             : 
    3015 UBC           0 :         close(fd);
    3016 EUB             : 
    3017 UIC           0 :         errno = save_errno;
    3018                 : 
    3019               0 :         ereport(ERROR,
    3020 ECB             :                 (errcode_for_file_access(),
    3021                 :                  errmsg("could not write to file \"%s\": %m", tmppath)));
    3022                 :     }
    3023 EUB             : 
    3024 GIC         684 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    3025             684 :     if (pg_fsync(fd) != 0)
    3026                 :     {
    3027 UNC           0 :         save_errno = errno;
    3028 UIC           0 :         close(fd);
    3029               0 :         errno = save_errno;
    3030               0 :         ereport(ERROR,
    3031 ECB             :                 (errcode_for_file_access(),
    3032                 :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3033                 :     }
    3034 GIC         684 :     pgstat_report_wait_end();
    3035                 : 
    3036             684 :     if (close(fd) != 0)
    3037 UIC           0 :         ereport(ERROR,
    3038                 :                 (errcode_for_file_access(),
    3039                 :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3040                 : 
    3041                 :     /*
    3042 ECB             :      * Now move the segment into place with its final name.  Cope with
    3043                 :      * possibility that someone else has created the file while we were
    3044                 :      * filling ours: if so, use ours to pre-create a future log segment.
    3045                 :      */
    3046 CBC         684 :     installed_segno = logsegno;
    3047 ECB             : 
    3048                 :     /*
    3049                 :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
    3050                 :      * that was a constant, but that was always a bit dubious: normally, at a
    3051                 :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
    3052                 :      * here, it was the offset from the insert location. We can't do the
    3053                 :      * normal XLOGfileslop calculation here because we don't have access to
    3054                 :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
    3055                 :      * CheckPointSegments.
    3056 EUB             :      */
    3057 GBC         684 :     max_segno = logsegno + CheckPointSegments;
    3058 GIC         684 :     if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
    3059                 :                                logtli))
    3060 ECB             :     {
    3061 GIC         684 :         *added = true;
    3062             684 :         elog(DEBUG2, "done creating and filling new WAL file");
    3063                 :     }
    3064                 :     else
    3065                 :     {
    3066                 :         /*
    3067                 :          * No need for any more future segments, or InstallXLogFileSegment()
    3068                 :          * failed to rename the file into place. If the rename failed, a
    3069                 :          * caller opening the file may fail.
    3070                 :          */
    3071 UIC           0 :         unlink(tmppath);
    3072               0 :         elog(DEBUG2, "abandoned new WAL file");
    3073                 :     }
    3074                 : 
    3075 GIC         684 :     return -1;
    3076 ECB             : }
    3077                 : 
    3078                 : /*
    3079                 :  * Create a new XLOG file segment, or open a pre-existing one.
    3080                 :  *
    3081                 :  * logsegno: identify segment to be created/opened.
    3082                 :  *
    3083                 :  * Returns FD of opened file.
    3084                 :  *
    3085                 :  * Note: errors here are ERROR not PANIC because we might or might not be
    3086                 :  * inside a critical section (eg, during checkpoint there is no reason to
    3087                 :  * take down the system on failure).  They will promote to PANIC if we are
    3088                 :  * in a critical section.
    3089                 :  */
    3090                 : int
    3091 CBC        7624 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
    3092 EUB             : {
    3093                 :     bool        ignore_added;
    3094                 :     char        path[MAXPGPATH];
    3095 ECB             :     int         fd;
    3096                 : 
    3097 GIC        7624 :     Assert(logtli != 0);
    3098                 : 
    3099            7624 :     fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
    3100            7624 :     if (fd >= 0)
    3101            6949 :         return fd;
    3102                 : 
    3103                 :     /* Now open original target segment (might not be file I just made) */
    3104 GNC         675 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3105             675 :                        get_sync_bit(sync_method));
    3106 GIC         675 :     if (fd < 0)
    3107 UIC           0 :         ereport(ERROR,
    3108                 :                 (errcode_for_file_access(),
    3109                 :                  errmsg("could not open file \"%s\": %m", path)));
    3110 GIC         675 :     return fd;
    3111                 : }
    3112                 : 
    3113                 : /*
    3114                 :  * Create a new XLOG file segment by copying a pre-existing one.
    3115 ECB             :  *
    3116                 :  * destsegno: identify segment to be created.
    3117                 :  *
    3118                 :  * srcTLI, srcsegno: identify segment to be copied (could be from
    3119                 :  *      a different timeline)
    3120                 :  *
    3121                 :  * upto: how much of the source file to copy (the rest is filled with
    3122                 :  *      zeros)
    3123                 :  *
    3124                 :  * Currently this is only used during recovery, and so there are no locking
    3125                 :  * considerations.  But we should be just as tense as XLogFileInit to avoid
    3126                 :  * emplacing a bogus file.
    3127                 :  */
    3128                 : static void
    3129 CBC          30 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
    3130 ECB             :              TimeLineID srcTLI, XLogSegNo srcsegno,
    3131                 :              int upto)
    3132 EUB             : {
    3133                 :     char        path[MAXPGPATH];
    3134                 :     char        tmppath[MAXPGPATH];
    3135                 :     PGAlignedXLogBlock buffer;
    3136                 :     int         srcfd;
    3137                 :     int         fd;
    3138                 :     int         nbytes;
    3139 ECB             : 
    3140                 :     /*
    3141                 :      * Open the source file
    3142                 :      */
    3143 GIC          30 :     XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
    3144 CBC          30 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    3145              30 :     if (srcfd < 0)
    3146 UBC           0 :         ereport(ERROR,
    3147                 :                 (errcode_for_file_access(),
    3148                 :                  errmsg("could not open file \"%s\": %m", path)));
    3149                 : 
    3150                 :     /*
    3151                 :      * Copy into a temp file name.
    3152                 :      */
    3153 CBC          30 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
    3154                 : 
    3155 GIC          30 :     unlink(tmppath);
    3156                 : 
    3157 ECB             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    3158 GIC          30 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3159              30 :     if (fd < 0)
    3160 UIC           0 :         ereport(ERROR,
    3161                 :                 (errcode_for_file_access(),
    3162                 :                  errmsg("could not create file \"%s\": %m", tmppath)));
    3163 ECB             : 
    3164                 :     /*
    3165                 :      * Do the data copying.
    3166                 :      */
    3167 GIC       61470 :     for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
    3168                 :     {
    3169                 :         int         nread;
    3170 ECB             : 
    3171 CBC       61440 :         nread = upto - nbytes;
    3172 ECB             : 
    3173                 :         /*
    3174                 :          * The part that is not read from the source file is filled with
    3175                 :          * zeros.
    3176 EUB             :          */
    3177 GBC       61440 :         if (nread < sizeof(buffer))
    3178 GIC          30 :             memset(buffer.data, 0, sizeof(buffer));
    3179                 : 
    3180           61440 :         if (nread > 0)
    3181                 :         {
    3182 EUB             :             int         r;
    3183                 : 
    3184 GIC        2603 :             if (nread > sizeof(buffer))
    3185            2573 :                 nread = sizeof(buffer);
    3186            2603 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
    3187 CBC        2603 :             r = read(srcfd, buffer.data, nread);
    3188 GIC        2603 :             if (r != nread)
    3189 ECB             :             {
    3190 LBC           0 :                 if (r < 0)
    3191               0 :                     ereport(ERROR,
    3192                 :                             (errcode_for_file_access(),
    3193 EUB             :                              errmsg("could not read file \"%s\": %m",
    3194                 :                                     path)));
    3195                 :                 else
    3196 UIC           0 :                     ereport(ERROR,
    3197                 :                             (errcode(ERRCODE_DATA_CORRUPTED),
    3198 EUB             :                              errmsg("could not read file \"%s\": read %d of %zu",
    3199                 :                                     path, r, (Size) nread)));
    3200                 :             }
    3201 GIC        2603 :             pgstat_report_wait_end();
    3202 EUB             :         }
    3203 GIC       61440 :         errno = 0;
    3204           61440 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
    3205           61440 :         if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
    3206 ECB             :         {
    3207 UIC           0 :             int         save_errno = errno;
    3208                 : 
    3209 ECB             :             /*
    3210                 :              * If we fail to make the file, delete it to release disk space
    3211 EUB             :              */
    3212 UIC           0 :             unlink(tmppath);
    3213                 :             /* if write didn't set errno, assume problem is no disk space */
    3214 LBC           0 :             errno = save_errno ? save_errno : ENOSPC;
    3215                 : 
    3216               0 :             ereport(ERROR,
    3217 EUB             :                     (errcode_for_file_access(),
    3218                 :                      errmsg("could not write to file \"%s\": %m", tmppath)));
    3219                 :         }
    3220 GIC       61440 :         pgstat_report_wait_end();
    3221 ECB             :     }
    3222 EUB             : 
    3223 GIC          30 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
    3224              30 :     if (pg_fsync(fd) != 0)
    3225 UIC           0 :         ereport(data_sync_elevel(ERROR),
    3226                 :                 (errcode_for_file_access(),
    3227                 :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    3228 GIC          30 :     pgstat_report_wait_end();
    3229 ECB             : 
    3230 GBC          30 :     if (CloseTransientFile(fd) != 0)
    3231 LBC           0 :         ereport(ERROR,
    3232                 :                 (errcode_for_file_access(),
    3233                 :                  errmsg("could not close file \"%s\": %m", tmppath)));
    3234                 : 
    3235 GIC          30 :     if (CloseTransientFile(srcfd) != 0)
    3236 UIC           0 :         ereport(ERROR,
    3237                 :                 (errcode_for_file_access(),
    3238                 :                  errmsg("could not close file \"%s\": %m", path)));
    3239                 : 
    3240                 :     /*
    3241                 :      * Now move the segment into place with its final name.
    3242                 :      */
    3243 GIC          30 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
    3244 UIC           0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
    3245 GIC          30 : }
    3246                 : 
    3247                 : /*
    3248                 :  * Install a new XLOG segment file as a current or future log segment.
    3249                 :  *
    3250                 :  * This is used both to install a newly-created segment (which has a temp
    3251                 :  * filename while it's being created) and to recycle an old segment.
    3252                 :  *
    3253                 :  * *segno: identify segment to install as (or first possible target).
    3254                 :  * When find_free is true, this is modified on return to indicate the
    3255                 :  * actual installation location or last segment searched.
    3256                 :  *
    3257                 :  * tmppath: initial name of file to install.  It will be renamed into place.
    3258                 :  *
    3259                 :  * find_free: if true, install the new segment at the first empty segno
    3260 ECB             :  * number at or after the passed numbers.  If false, install the new segment
    3261                 :  * exactly where specified, deleting any existing segment file there.
    3262                 :  *
    3263                 :  * max_segno: maximum segment number to install the new file as.  Fail if no
    3264                 :  * free slot is found between *segno and max_segno. (Ignored when find_free
    3265                 :  * is false.)
    3266                 :  *
    3267                 :  * tli: The timeline on which the new segment should be installed.
    3268                 :  *
    3269                 :  * Returns true if the file was installed successfully.  false indicates that
    3270                 :  * max_segno limit was exceeded, the startup process has disabled this
    3271                 :  * function for now, or an error occurred while renaming the file into place.
    3272                 :  */
    3273 EUB             : static bool
    3274 GBC        1267 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
    3275                 :                        bool find_free, XLogSegNo max_segno, TimeLineID tli)
    3276                 : {
    3277 ECB             :     char        path[MAXPGPATH];
    3278                 :     struct stat stat_buf;
    3279                 : 
    3280 CBC        1267 :     Assert(tli != 0);
    3281                 : 
    3282 GIC        1267 :     XLogFilePath(path, tli, *segno, wal_segment_size);
    3283                 : 
    3284            1267 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    3285 CBC        1267 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3286                 :     {
    3287 LBC           0 :         LWLockRelease(ControlFileLock);
    3288 UIC           0 :         return false;
    3289                 :     }
    3290 ECB             : 
    3291 CBC        1267 :     if (!find_free)
    3292                 :     {
    3293 ECB             :         /* Force installation: get rid of any pre-existing segment file */
    3294 CBC          30 :         durable_unlink(path, DEBUG1);
    3295                 :     }
    3296                 :     else
    3297                 :     {
    3298 ECB             :         /* Find a free slot to put it in */
    3299 CBC        1643 :         while (stat(path, &stat_buf) == 0)
    3300                 :         {
    3301 GBC         426 :             if ((*segno) >= max_segno)
    3302                 :             {
    3303 EUB             :                 /* Failed to find a free slot within specified range */
    3304 GIC          20 :                 LWLockRelease(ControlFileLock);
    3305              20 :                 return false;
    3306 ECB             :             }
    3307 GIC         406 :             (*segno)++;
    3308 CBC         406 :             XLogFilePath(path, tli, *segno, wal_segment_size);
    3309                 :         }
    3310                 :     }
    3311                 : 
    3312 GNC        1247 :     Assert(access(path, F_OK) != 0 && errno == ENOENT);
    3313            1247 :     if (durable_rename(tmppath, path, LOG) != 0)
    3314                 :     {
    3315 UIC           0 :         LWLockRelease(ControlFileLock);
    3316                 :         /* durable_rename already emitted log message */
    3317 LBC           0 :         return false;
    3318                 :     }
    3319 ECB             : 
    3320 CBC        1247 :     LWLockRelease(ControlFileLock);
    3321 ECB             : 
    3322 GBC        1247 :     return true;
    3323                 : }
    3324                 : 
    3325                 : /*
    3326 ECB             :  * Open a pre-existing logfile segment for writing.
    3327                 :  */
    3328                 : int
    3329 GIC           7 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
    3330                 : {
    3331                 :     char        path[MAXPGPATH];
    3332                 :     int         fd;
    3333 ECB             : 
    3334 GIC           7 :     XLogFilePath(path, tli, segno, wal_segment_size);
    3335 ECB             : 
    3336 GNC           7 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
    3337               7 :                        get_sync_bit(sync_method));
    3338 GIC           7 :     if (fd < 0)
    3339 UIC           0 :         ereport(PANIC,
    3340                 :                 (errcode_for_file_access(),
    3341                 :                  errmsg("could not open file \"%s\": %m", path)));
    3342                 : 
    3343 GIC           7 :     return fd;
    3344                 : }
    3345 ECB             : 
    3346                 : /*
    3347                 :  * Close the current logfile segment for writing.
    3348                 :  */
    3349                 : static void
    3350 GIC        1705 : XLogFileClose(void)
    3351                 : {
    3352 GBC        1705 :     Assert(openLogFile >= 0);
    3353                 : 
    3354 EUB             :     /*
    3355                 :      * WAL segment files will not be re-read in normal operation, so we advise
    3356                 :      * the OS to release any cached pages.  But do not do so if WAL archiving
    3357                 :      * or streaming is active, because archiver and walsender process could
    3358                 :      * use the cache to read the WAL segment.
    3359                 :      */
    3360                 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    3361 GNC        1705 :     if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
    3362 CBC        1257 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
    3363 ECB             : #endif
    3364                 : 
    3365 GIC        1705 :     if (close(openLogFile) != 0)
    3366                 :     {
    3367                 :         char        xlogfname[MAXFNAMELEN];
    3368 UIC           0 :         int         save_errno = errno;
    3369                 : 
    3370               0 :         XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
    3371               0 :         errno = save_errno;
    3372               0 :         ereport(PANIC,
    3373                 :                 (errcode_for_file_access(),
    3374                 :                  errmsg("could not close file \"%s\": %m", xlogfname)));
    3375                 :     }
    3376                 : 
    3377 GIC        1705 :     openLogFile = -1;
    3378            1705 :     ReleaseExternalFD();
    3379            1705 : }
    3380                 : 
    3381                 : /*
    3382                 :  * Preallocate log files beyond the specified log endpoint.
    3383                 :  *
    3384                 :  * XXX this is currently extremely conservative, since it forces only one
    3385 ECB             :  * future log segment to exist, and even that only if we are 75% done with
    3386                 :  * the current one.  This is only appropriate for very low-WAL-volume systems.
    3387                 :  * High-volume systems will be OK once they've built up a sufficient set of
    3388                 :  * recycled log segments, but the startup transient is likely to include
    3389                 :  * a lot of segment creations by foreground processes, which is not so good.
    3390                 :  *
    3391                 :  * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
    3392                 :  * trouble; for example, a full filesystem is one cause.  The checkpoint WAL
    3393                 :  * and/or ControlFile updates already completed.  If a RequestCheckpoint()
    3394                 :  * initiated the present checkpoint and an ERROR ends this function, the
    3395                 :  * command that called RequestCheckpoint() fails.  That's not ideal, but it's
    3396                 :  * not worth contorting more functions to use caller-specified elevel values.
    3397                 :  * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
    3398                 :  * reporting and resource reclamation.)
    3399                 :  */
    3400                 : static void
    3401 CBC        2538 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
    3402 ECB             : {
    3403                 :     XLogSegNo   _logSegNo;
    3404                 :     int         lf;
    3405                 :     bool        added;
    3406                 :     char        path[MAXPGPATH];
    3407                 :     uint64      offset;
    3408                 : 
    3409 GIC        2538 :     if (!XLogCtl->InstallXLogFileSegmentActive)
    3410               4 :         return;                 /* unlocked check says no */
    3411                 : 
    3412            2534 :     XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
    3413            2534 :     offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
    3414            2534 :     if (offset >= (uint32) (0.75 * wal_segment_size))
    3415                 :     {
    3416              42 :         _logSegNo++;
    3417              42 :         lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
    3418              42 :         if (lf >= 0)
    3419              33 :             close(lf);
    3420              42 :         if (added)
    3421               9 :             CheckpointStats.ckpt_segs_added++;
    3422 ECB             :     }
    3423                 : }
    3424                 : 
    3425                 : /*
    3426                 :  * Throws an error if the given log segment has already been removed or
    3427                 :  * recycled. The caller should only pass a segment that it knows to have
    3428                 :  * existed while the server has been running, as this function always
    3429                 :  * succeeds if no WAL segments have been removed since startup.
    3430                 :  * 'tli' is only used in the error message.
    3431                 :  *
    3432                 :  * Note: this function guarantees to keep errno unchanged on return.
    3433                 :  * This supports callers that use this to possibly deliver a better
    3434                 :  * error message about a missing file, while still being able to throw
    3435 EUB             :  * a normal file-access error afterwards, if this does return.
    3436                 :  */
    3437                 : void
    3438 GIC       47944 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
    3439                 : {
    3440           47944 :     int         save_errno = errno;
    3441                 :     XLogSegNo   lastRemovedSegNo;
    3442 ECB             : 
    3443 CBC       47944 :     SpinLockAcquire(&XLogCtl->info_lck);
    3444 GIC       47944 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3445           47944 :     SpinLockRelease(&XLogCtl->info_lck);
    3446                 : 
    3447           47944 :     if (segno <= lastRemovedSegNo)
    3448                 :     {
    3449                 :         char        filename[MAXFNAMELEN];
    3450                 : 
    3451 UIC           0 :         XLogFileName(filename, tli, segno, wal_segment_size);
    3452               0 :         errno = save_errno;
    3453 LBC           0 :         ereport(ERROR,
    3454                 :                 (errcode_for_file_access(),
    3455                 :                  errmsg("requested WAL segment %s has already been removed",
    3456                 :                         filename)));
    3457 ECB             :     }
    3458 CBC       47944 :     errno = save_errno;
    3459           47944 : }
    3460                 : 
    3461 ECB             : /*
    3462                 :  * Return the last WAL segment removed, or 0 if no segment has been removed
    3463                 :  * since startup.
    3464                 :  *
    3465                 :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
    3466                 :  * with that.
    3467                 :  */
    3468                 : XLogSegNo
    3469 GIC         745 : XLogGetLastRemovedSegno(void)
    3470 ECB             : {
    3471                 :     XLogSegNo   lastRemovedSegNo;
    3472                 : 
    3473 GIC         745 :     SpinLockAcquire(&XLogCtl->info_lck);
    3474             745 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
    3475 CBC         745 :     SpinLockRelease(&XLogCtl->info_lck);
    3476                 : 
    3477             745 :     return lastRemovedSegNo;
    3478 ECB             : }
    3479                 : 
    3480                 : 
    3481                 : /*
    3482                 :  * Update the last removed segno pointer in shared memory, to reflect that the
    3483                 :  * given XLOG file has been removed.
    3484                 :  */
    3485                 : static void
    3486 GIC         573 : UpdateLastRemovedPtr(char *filename)
    3487                 : {
    3488                 :     uint32      tli;
    3489                 :     XLogSegNo   segno;
    3490 ECB             : 
    3491 GIC         573 :     XLogFromFileName(filename, &tli, &segno, wal_segment_size);
    3492                 : 
    3493             573 :     SpinLockAcquire(&XLogCtl->info_lck);
    3494             573 :     if (segno > XLogCtl->lastRemovedSegNo)
    3495 CBC         225 :         XLogCtl->lastRemovedSegNo = segno;
    3496 GIC         573 :     SpinLockRelease(&XLogCtl->info_lck);
    3497 CBC         573 : }
    3498 ECB             : 
    3499                 : /*
    3500                 :  * Remove all temporary log files in pg_wal
    3501                 :  *
    3502                 :  * This is called at the beginning of recovery after a previous crash,
    3503                 :  * at a point where no other processes write fresh WAL data.
    3504                 :  */
    3505 EUB             : static void
    3506 GBC         131 : RemoveTempXlogFiles(void)
    3507 EUB             : {
    3508                 :     DIR        *xldir;
    3509 ECB             :     struct dirent *xlde;
    3510                 : 
    3511 GIC         131 :     elog(DEBUG2, "removing all temporary WAL segments");
    3512                 : 
    3513             131 :     xldir = AllocateDir(XLOGDIR);
    3514             709 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3515                 :     {
    3516                 :         char        path[MAXPGPATH];
    3517                 : 
    3518             578 :         if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
    3519             578 :             continue;
    3520                 : 
    3521 UIC           0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
    3522               0 :         unlink(path);
    3523 LBC           0 :         elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
    3524                 :     }
    3525 GIC         131 :     FreeDir(xldir);
    3526             131 : }
    3527                 : 
    3528                 : /*
    3529                 :  * Recycle or remove all log files older or equal to passed segno.
    3530                 :  *
    3531                 :  * endptr is current (or recent) end of xlog, and lastredoptr is the
    3532                 :  * redo pointer of the last checkpoint. These are used to determine
    3533 ECB             :  * whether we want to recycle rather than delete no-longer-wanted log files.
    3534                 :  *
    3535                 :  * insertTLI is the current timeline for XLOG insertion. Any recycled
    3536                 :  * segments should be reused for this timeline.
    3537                 :  */
    3538                 : static void
    3539 GIC        2363 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
    3540                 :                    TimeLineID insertTLI)
    3541 ECB             : {
    3542                 :     DIR        *xldir;
    3543                 :     struct dirent *xlde;
    3544                 :     char        lastoff[MAXFNAMELEN];
    3545                 :     XLogSegNo   endlogSegNo;
    3546                 :     XLogSegNo   recycleSegNo;
    3547                 : 
    3548                 :     /*
                         :      * Initialize info about where to try to recycle to: endlogSegNo is
                         :      * derived from endptr via XLByteToSeg(), and recycleSegNo is the value
                         :      * that XLOGfileslop() returns for lastredoptr.
                         :      */
    3549 GIC        2363 :     XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
    3550            2363 :     recycleSegNo = XLOGfileslop(lastredoptr);
    3551 ECB             : 
    3552                 :     /*
    3553                 :      * Construct a filename of the last segment to be kept. The timeline ID
    3554                 :      * doesn't matter, we ignore that in the comparison. (During recovery,
    3555                 :      * InsertTimeLineID isn't set, so we can't use that.)
                         :      *
                         :      * The name is built from segno with a timeline ID of 0.
    3556                 :      */
    3557 GIC        2363 :     XLogFileName(lastoff, 0, segno, wal_segment_size);
    3558                 : 
    3559            2363 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
    3560                 :          lastoff);
    3561                 : 
    3562            2363 :     xldir = AllocateDir(XLOGDIR);
    3563                 : 
    3564           13798 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3565                 :     {
    3566 ECB             :         /* Ignore files that are neither XLOG segments nor partial segments */
    3567 GIC       11435 :         if (!IsXLogFileName(xlde->d_name) &&
    3568 CBC        7147 :             !IsPartialXLogFileName(xlde->d_name))
    3569 GIC        7145 :             continue;
    3570                 : 
    3571 ECB             :         /*
    3572                 :          * We ignore the timeline part of the XLOG segment identifiers in
    3573                 :          * deciding whether a segment is still needed.  This ensures that we
    3574                 :          * won't prematurely remove a segment from a parent timeline. We could
    3575                 :          * probably be a little more proactive about removing segments of
    3576                 :          * non-parent timelines, but that would be a whole lot more
    3577                 :          * complicated.
    3578                 :          *
    3579                 :          * We use the alphanumeric sorting property of the filenames to decide
    3580                 :          * which ones are earlier than the lastoff segment.
                         :          *
                         :          * The "+ 8" on both strings skips the leading 8 characters, i.e. the
                         :          * timeline part.  Because the test is "<= 0", a file whose remaining
                         :          * name equals that of lastoff is itself a candidate for removal.
    3581                 :          */
    3582 GIC        4290 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
    3583                 :         {
    3584             579 :             if (XLogArchiveCheckDone(xlde->d_name))
    3585                 :             {
    3586                 :                 /* Update the last removed location in shared memory first */
    3587             573 :                 UpdateLastRemovedPtr(xlde->d_name);
    3588                 : 
    3589 GNC         573 :                 RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
    3590                 :             }
    3591                 :         }
    3592                 :     }
    3593                 : 
    3594 GIC        2363 :     FreeDir(xldir);
    3595            2363 : }
    3596 ECB             : 
    3597                 : /*
    3598                 :  * Remove WAL files that are not part of the given timeline's history.
    3599                 :  *
    3600                 :  * This is called during recovery, whenever we switch to follow a new
    3601                 :  * timeline, and at the end of recovery when we create a new timeline. We
    3602                 :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
    3603                 :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
    3604                 :  * that we haven't used yet, and contain garbage. If we just leave them in
    3605                 :  * pg_wal, they will eventually be archived, and we can't let that happen.
    3606                 :  * Files that belong to our timeline history are valid, because we have
    3607                 :  * successfully replayed them, but from others we can't be sure.
    3608                 :  *
    3609                 :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
    3610                 :  * and 'newTLI' is the new timeline we switch to.
    3611                 :  */
    3612                 : void
    3613 GIC          48 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
    3614                 : {
    3615                 :     DIR        *xldir;
    3616 ECB             :     struct dirent *xlde;
    3617                 :     char        switchseg[MAXFNAMELEN];
    3618                 :     XLogSegNo   endLogSegNo;
    3619                 :     XLogSegNo   switchLogSegNo;
    3620                 :     XLogSegNo   recycleSegNo;
    3621                 : 
    3622                 :     /*
    3623                 :      * Initialize info about where to begin the work.  This will recycle,
    3624                 :      * somewhat arbitrarily, 10 future segments.
                         :      *
                         :      * switchLogSegNo comes from XLByteToPrevSeg() and endLogSegNo from
                         :      * XLByteToSeg(), both applied to switchpoint; recycleSegNo is then
                         :      * endLogSegNo + 10.
    3625                 :      */
    3626 CBC          48 :     XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
    3627              48 :     XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
    3628 GIC          48 :     recycleSegNo = endLogSegNo + 10;
    3629                 : 
    3630                 :     /*
    3631                 :      * Construct a filename of the last segment to be kept.
                         :      *
                         :      * It is built with newTLI, so that the first 8 characters of each
                         :      * directory entry can be compared against it in the loop below.
    3632                 :      */
    3633              48 :     XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
    3634 ECB             : 
    3635 CBC          48 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
    3636                 :          switchseg);
    3637                 : 
    3638 GIC          48 :     xldir = AllocateDir(XLOGDIR);
    3639                 : 
    3640             408 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3641                 :     {
    3642                 :         /* Ignore files that are not XLOG segments */
    3643 CBC         360 :         if (!IsXLogFileName(xlde->d_name))
    3644             208 :             continue;
    3645                 : 
    3646                 :         /*
    3647                 :          * Remove files that are on a timeline older than the new one we're
    3648 ECB             :          * switching to, but with a segment number >= the first segment on the
    3649                 :          * new timeline.
                         :          *
                         :          * Both tests are string comparisons against switchseg: the first 8
                         :          * characters must sort lower, and the rest of the name must sort
                         :          * higher.
    3650                 :          */
    3651 GIC         152 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
    3652              98 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
    3653                 :         {
    3654                 :             /*
    3655                 :              * If the file has already been marked as .ready, however, don't
    3656                 :              * remove it yet. It should be OK to remove it - files that are
    3657                 :              * not part of our timeline history are not required for recovery
    3658                 :              * - but seems safer to let them be archived and removed later.
    3659                 :              */
    3660              12 :             if (!XLogArchiveIsReady(xlde->d_name))
    3661 GNC          12 :                 RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
    3662                 :         }
    3663                 :     }
    3664 ECB             : 
    3665 GIC          48 :     FreeDir(xldir);
    3666              48 : }
    3667                 : 
    3668                 : /*
    3669                 :  * Recycle or remove a log file that's no longer needed.
    3670                 :  *
    3671                 :  * segment_de is the dirent structure of the segment to recycle or remove.
    3672                 :  * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
    3673                 :  * the segment number of the current (or recent) end of WAL.
    3674 ECB             :  *
    3675                 :  * endlogSegNo gets incremented if the segment is recycled, so that the same
    3676                 :  * slot is not checked again by later calls of this function.
    3677                 :  *
    3678                 :  * insertTLI is the current timeline for XLOG insertion. Any recycled segments
    3679                 :  * should be used for this timeline.
    3680                 :  */
    3681                 : static void
    3682 GNC         585 : RemoveXlogFile(const struct dirent *segment_de,
    3683                 :                XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
    3684                 :                TimeLineID insertTLI)
    3685 ECB             : {
    3686                 :     char        path[MAXPGPATH];
    3687                 : #ifdef WIN32
    3688                 :     char        newpath[MAXPGPATH];
    3689                 : #endif
    3690 GNC         585 :     const char *segname = segment_de->d_name;
    3691                 : 
    3692 CBC         585 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
    3693                 : 
    3694 ECB             :     /*
    3695                 :      * Before deleting the file, see if it can be recycled as a future log
    3696                 :      * segment. Only recycle normal files, because we don't want to recycle
    3697                 :      * symbolic links pointing to a separate archive directory.
                         :      *
                         :      * The conditions below are evaluated left to right and short-circuit,
                         :      * so InstallXLogFileSegment() is only called when wal_recycle is set,
                         :      * *endlogSegNo has not passed recycleSegNo, the
                         :      * InstallXLogFileSegmentActive flag is set, and the path is a regular
                         :      * file according to get_dirent_type().
    3698                 :      */
    3699 GIC         585 :     if (wal_recycle &&
    3700             585 :         *endlogSegNo <= recycleSegNo &&
    3701 CBC        1111 :         XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
    3702 GNC        1106 :         get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
    3703 GIC         553 :         InstallXLogFileSegment(endlogSegNo, path,
    3704                 :                                true, recycleSegNo, insertTLI))
    3705                 :     {
    3706             533 :         ereport(DEBUG2,
    3707                 :                 (errmsg_internal("recycled write-ahead log file \"%s\"",
    3708                 :                                  segname)));
    3709             533 :         CheckpointStats.ckpt_segs_recycled++;
    3710                 :         /* Needn't recheck that slot on future iterations */
    3711             533 :         (*endlogSegNo)++;
    3712                 :     }
    3713                 :     else
    3714                 :     {
    3715                 :         /* No need for any more future segments, or recycling failed ... */
    3716                 :         int         rc;
    3717                 : 
    3718              52 :         ereport(DEBUG2,
    3719                 :                 (errmsg_internal("removing write-ahead log file \"%s\"",
    3720                 :                                  segname)));
    3721                 : 
    3722                 : #ifdef WIN32
    3723                 : 
    3724                 :         /*
    3725                 :          * On Windows, if another process (e.g. another backend) holds the file
    3726                 :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
    3727                 :          * will still show up in directory listing until the last handle is
    3728 ECB             :          * closed. To avoid confusing the lingering deleted file for a live
    3729                 :          * WAL file that needs to be archived, rename it before deleting it.
    3730                 :          *
    3731                 :          * If another process holds the file open without FILE_SHARE_DELETE
    3732                 :          * flag, rename will fail. We'll try again at the next checkpoint.
                         :          * (The early return below also skips XLogArchiveCleanup().)
    3733 EUB             :          */
    3734                 :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
    3735 ECB             :         if (rename(path, newpath) != 0)
    3736                 :         {
    3737                 :             ereport(LOG,
    3738                 :                     (errcode_for_file_access(),
    3739                 :                      errmsg("could not rename file \"%s\": %m",
    3740                 :                             path)));
    3741                 :             return;
    3742                 :         }
    3743                 :         rc = durable_unlink(newpath, LOG);
    3744                 : #else
    3745 GIC          52 :         rc = durable_unlink(path, LOG);
    3746                 : #endif
    3747              52 :         if (rc != 0)
    3748                 :         {
    3749                 :             /*
                         :              * Message already logged by durable_unlink().  Returning here
                         :              * skips the ckpt_segs_removed increment and the
                         :              * XLogArchiveCleanup() call below.
                         :              */
    3750 UIC           0 :             return;
    3751                 :         }
    3752 GIC          52 :         CheckpointStats.ckpt_segs_removed++;
    3753                 :     }
    3754                 : 
    3755 CBC         585 :     XLogArchiveCleanup(segname);
    3756                 : }
    3757                 : 
    3758                 : /*
    3759                 :  * Verify whether pg_wal and pg_wal/archive_status exist.
    3760                 :  * If the latter does not exist, recreate it.
    3761 ECB             :  *
    3762                 :  * It is not the goal of this function to verify the contents of these
    3763 EUB             :  * directories, but to help in cases where someone has performed a cluster
    3764                 :  * copy for PITR purposes but omitted pg_wal from the copy.
    3765                 :  *
    3766                 :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
    3767                 :  * policy decision was made not to.  It is fairly common for pg_wal to be
    3768 ECB             :  * a symlink, and if that was the DBA's intent then automatically making a
    3769                 :  * plain directory would result in degraded performance with no notice.
    3770                 :  */
    3771                 : static void
    3772 CBC        1176 : ValidateXLOGDirectoryStructure(void)
    3773 EUB             : {
    3774                 :     char        path[MAXPGPATH];
    3775                 :     struct stat stat_buf;
    3776                 : 
    3777                 :     /* Check for pg_wal; if it doesn't exist or isn't a directory, error out */
    3778 GIC        1176 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
    3779 GBC        1176 :         !S_ISDIR(stat_buf.st_mode))
    3780 UIC           0 :         ereport(FATAL,
    3781 EUB             :                 (errmsg("required WAL directory \"%s\" does not exist",
    3782                 :                         XLOGDIR)));
    3783                 : 
    3784                 :     /* Check for archive_status */
    3785 GIC        1176 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
    3786 CBC        1176 :     if (stat(path, &stat_buf) == 0)
    3787                 :     {
    3788                 :         /*
                         :          * Check for weird cases where it exists but isn't a directory.  The
                         :          * message reported is the same "does not exist" one used above.
                         :          */
    3789 GIC        1176 :         if (!S_ISDIR(stat_buf.st_mode))
    3790 UIC           0 :             ereport(FATAL,
    3791                 :                     (errmsg("required WAL directory \"%s\" does not exist",
    3792                 :                             path)));
    3793                 :     }
    3794 ECB             :     else                        /* stat() failed: log it and try to create the directory */
    3795                 :     {
    3796 UIC           0 :         ereport(LOG,
    3797                 :                 (errmsg("creating missing WAL directory \"%s\"", path)));
    3798               0 :         if (MakePGDirectory(path) < 0)
    3799               0 :             ereport(FATAL,
    3800 ECB             :                     (errmsg("could not create missing directory \"%s\": %m",
    3801                 :                             path)));
    3802                 :     }
    3803 GIC        1176 : }
    3804 ECB             : 
    3805                 : /*
    3806                 :  * Remove previous backup history files.  This also retries creation of
    3807                 :  * .ready files for any backup history files for which XLogArchiveNotify
    3808                 :  * failed earlier.
    3809                 :  */
    3810                 : static void
    3811 CBC         117 : CleanupBackupHistory(void)
    3812 ECB             : {
    3813                 :     DIR        *xldir;
    3814                 :     struct dirent *xlde;
    3815                 :     char        path[MAXPGPATH + sizeof(XLOGDIR)];  /* holds XLOGDIR "/" file name */
    3816                 : 
    3817 CBC         117 :     xldir = AllocateDir(XLOGDIR);
    3818 ECB             : 
    3819 GIC        1047 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
    3820                 :     {
    3821             813 :         if (IsBackupHistoryFileName(xlde->d_name))
    3822                 :         {
    3823             118 :             if (XLogArchiveCheckDone(xlde->d_name))
    3824                 :             {
    3825             108 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
    3826                 :                      xlde->d_name);
    3827             108 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
    3828             108 :                 unlink(path);   /* result is not checked */
    3829             108 :                 XLogArchiveCleanup(xlde->d_name);
    3830                 :             }
    3831                 :         }
    3832                 :     }
    3833                 : 
    3834             117 :     FreeDir(xldir);
    3835             117 : }
    3836                 : 
    3837 ECB             : /*
    3838                 :  * I/O routines for pg_control
    3839                 :  *
    3840                 :  * *ControlFile is a buffer in shared memory that holds an image of the
    3841                 :  * contents of pg_control.  WriteControlFile() initializes pg_control
    3842                 :  * given a preloaded buffer, ReadControlFile() loads the buffer from
    3843                 :  * the pg_control file (during postmaster or standalone-backend startup),
    3844                 :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
    3845                 :  * InitControlFile() fills the buffer with initial values.
    3846                 :  *
    3847                 :  * For simplicity, WriteControlFile() initializes the fields of pg_control
    3848 EUB             :  * that are related to checking backend/database compatibility, and
    3849                 :  * ReadControlFile() verifies they are correct.  We could split out the
    3850                 :  * I/O and compatibility-check functions, but there seems no need currently.
    3851                 :  */
    3852 ECB             : 
    3853                 : static void
    3854 CBC         305 : InitControlFile(uint64 sysidentifier)
    3855 ECB             : {
    3856                 :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
    3857                 : 
    3858                 :     /*
    3859                 :      * Generate a random nonce. This is used for authentication requests that
    3860                 :      * will fail because the user does not exist. The nonce is used to create
    3861                 :      * a genuine-looking password challenge for the non-existent user, in lieu
    3862                 :      * of an actual stored password.
                         :      *
                         :      * The nonce is generated into a local buffer here and copied into
                         :      * ControlFile only after the memset() below has zeroed the structure.
    3863                 :      */
    3864 CBC         305 :     if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
    3865 LBC           0 :         ereport(PANIC,
    3866 ECB             :                 (errcode(ERRCODE_INTERNAL_ERROR),
    3867                 :                  errmsg("could not generate secret authorization token")));
    3868                 : 
    3869 CBC         305 :     memset(ControlFile, 0, sizeof(ControlFileData));
    3870                 :     /* Initialize pg_control status fields */
    3871 GIC         305 :     ControlFile->system_identifier = sysidentifier;
    3872 CBC         305 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
    3873 GIC         305 :     ControlFile->state = DB_SHUTDOWNED;
    3874             305 :     ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
    3875                 : 
    3876                 :     /*
                         :      * Set important parameter values for use when replaying WAL.  Each
                         :      * field is copied from the like-named variable, except that
                         :      * data_checksum_version comes from bootstrap_data_checksum_version.
                         :      */
    3877             305 :     ControlFile->MaxConnections = MaxConnections;
    3878             305 :     ControlFile->max_worker_processes = max_worker_processes;
    3879             305 :     ControlFile->max_wal_senders = max_wal_senders;
    3880 CBC         305 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
    3881             305 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
    3882 GIC         305 :     ControlFile->wal_level = wal_level;
    3883 CBC         305 :     ControlFile->wal_log_hints = wal_log_hints;
    3884             305 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
    3885 GIC         305 :     ControlFile->data_checksum_version = bootstrap_data_checksum_version;
    3886 CBC         305 : }
    3887 ECB             : 
    3888                 : static void
    3889 CBC         305 : WriteControlFile(void)
    3890                 : {
    3891 ECB             :     int         fd;
    3892                 :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
    3893                 : 
    3894                 :     /*
    3895                 :      * Initialize version and compatibility-check fields
                         :      *
                         :      * These assignments overwrite whatever the fields held before;
                         :      * xlog_seg_size is taken from wal_segment_size.  ReadControlFile()
                         :      * compares several of these fields against the server's own values.
    3896                 :      */
    3897 GIC         305 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
    3898             305 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
    3899                 : 
    3900             305 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    3901             305 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
    3902                 : 
    3903             305 :     ControlFile->blcksz = BLCKSZ;
    3904 CBC         305 :     ControlFile->relseg_size = RELSEG_SIZE;
    3905             305 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    3906 GIC         305 :     ControlFile->xlog_seg_size = wal_segment_size;
    3907 ECB             : 
    3908 GIC         305 :     ControlFile->nameDataLen = NAMEDATALEN;
    3909 CBC         305 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    3910 EUB             : 
    3911 GIC         305 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
    3912             305 :     ControlFile->loblksize = LOBLKSIZE;
    3913                 : 
    3914             305 :     ControlFile->float8ByVal = FLOAT8PASSBYVAL;
    3915 ECB             : 
    3916                 :     /*
                         :      * Contents are protected with a CRC, computed over the bytes of
                         :      * *ControlFile that precede the crc field and stored in that field.
                         :      */
    3917 CBC         305 :     INIT_CRC32C(ControlFile->crc);
    3918 GIC         305 :     COMP_CRC32C(ControlFile->crc,
    3919                 :                 (char *) ControlFile,
    3920 EUB             :                 offsetof(ControlFileData, crc));
    3921 GBC         305 :     FIN_CRC32C(ControlFile->crc);
    3922 EUB             : 
    3923                 :     /*
    3924                 :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
    3925                 :      * the excess over sizeof(ControlFileData).  This reduces the odds of
    3926                 :      * premature-EOF errors when reading pg_control.  We'll still fail when we
    3927 ECB             :      * check the contents of the file, but hopefully with a more specific
    3928                 :      * error than "couldn't read pg_control".
    3929                 :      */
    3930 CBC         305 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
    3931 GBC         305 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
    3932                 : 
    3933 GIC         305 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    3934                 :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
    3935 CBC         305 :     if (fd < 0)
    3936 UIC           0 :         ereport(PANIC,
    3937 ECB             :                 (errcode_for_file_access(),
    3938 EUB             :                  errmsg("could not create file \"%s\": %m",
    3939                 :                         XLOG_CONTROL_FILE)));
    3940                 : 
    3941 GIC         305 :     errno = 0;                  /* lets the check below spot a write() that fails without setting errno */
    3942 CBC         305 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
    3943 GIC         305 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
    3944                 :     {
    3945 ECB             :         /* if write didn't set errno, assume problem is no disk space */
    3946 UIC           0 :         if (errno == 0)
    3947               0 :             errno = ENOSPC;
    3948               0 :         ereport(PANIC,
    3949                 :                 (errcode_for_file_access(),
    3950                 :                  errmsg("could not write to file \"%s\": %m",
    3951                 :                         XLOG_CONTROL_FILE)));
    3952                 :     }
    3953 GIC         305 :     pgstat_report_wait_end();
    3954                 : 
    3955 CBC         305 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
    3956 GIC         305 :     if (pg_fsync(fd) != 0)
    3957 LBC           0 :         ereport(PANIC,
    3958 EUB             :                 (errcode_for_file_access(),
    3959                 :                  errmsg("could not fsync file \"%s\": %m",
    3960                 :                         XLOG_CONTROL_FILE)));
    3961 GIC         305 :     pgstat_report_wait_end();
    3962                 : 
    3963 CBC         305 :     if (close(fd) != 0)
    3964 LBC           0 :         ereport(PANIC,
    3965 ECB             :                 (errcode_for_file_access(),
    3966                 :                  errmsg("could not close file \"%s\": %m",
    3967 EUB             :                         XLOG_CONTROL_FILE)));
    3968 GBC         305 : }
    3969                 : 
    3970                 : static void
    3971 GIC        1222 : ReadControlFile(void)
    3972                 : {
    3973 EUB             :     pg_crc32c   crc;
    3974                 :     int         fd;
    3975                 :     static char wal_segsz_str[20];
    3976                 :     int         r;
    3977                 : 
    3978 ECB             :     /*
    3979                 :      * Read data...
    3980                 :      */
    3981 GIC        1222 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
    3982                 :                        O_RDWR | PG_BINARY);
    3983            1222 :     if (fd < 0)
    3984 UIC           0 :         ereport(PANIC,
    3985                 :                 (errcode_for_file_access(),
    3986                 :                  errmsg("could not open file \"%s\": %m",
    3987                 :                         XLOG_CONTROL_FILE)));
    3988                 : 
    3989 CBC        1222 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
    3990 GBC        1222 :     r = read(fd, ControlFile, sizeof(ControlFileData));
    3991 GIC        1222 :     if (r != sizeof(ControlFileData))
    3992                 :     {
    3993 UIC           0 :         if (r < 0)
    3994               0 :             ereport(PANIC,
    3995                 :                     (errcode_for_file_access(),
    3996                 :                      errmsg("could not read file \"%s\": %m",
    3997                 :                             XLOG_CONTROL_FILE)));
    3998 ECB             :         else
    3999 UBC           0 :             ereport(PANIC,
    4000                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
    4001                 :                      errmsg("could not read file \"%s\": read %d of %zu",
    4002                 :                             XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
    4003                 :     }
    4004 GIC        1222 :     pgstat_report_wait_end();
    4005                 : 
    4006            1222 :     close(fd);
    4007 ECB             : 
    4008                 :     /*
    4009                 :      * Check for expected pg_control format version.  If this is wrong, the
    4010                 :      * CRC check will likely fail because we'll be checking the wrong number
    4011                 :      * of bytes.  Complaining about wrong version will probably be more
    4012                 :      * enlightening than complaining about wrong CRC.
    4013                 :      */
    4014 EUB             : 
    4015 GIC        1222 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
    4016 UIC           0 :         ereport(FATAL,
    4017                 :                 (errmsg("database files are incompatible with server"),
    4018                 :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
    4019                 :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
    4020                 :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
    4021                 :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
    4022 ECB             :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
    4023 EUB             : 
    4024 GIC        1222 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
    4025 UIC           0 :         ereport(FATAL,
    4026                 :                 (errmsg("database files are incompatible with server"),
    4027                 :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
    4028                 :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
    4029 ECB             :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
    4030 EUB             :                  errhint("It looks like you need to initdb.")));
    4031                 : 
    4032                 :     /* Now check the CRC. */
    4033 GIC        1222 :     INIT_CRC32C(crc);
    4034            1222 :     COMP_CRC32C(crc,
    4035                 :                 (char *) ControlFile,
    4036 ECB             :                 offsetof(ControlFileData, crc));
    4037 GBC        1222 :     FIN_CRC32C(crc);
    4038                 : 
    4039 GIC        1222 :     if (!EQ_CRC32C(crc, ControlFile->crc))
    4040 UIC           0 :         ereport(FATAL,
    4041 ECB             :                 (errmsg("incorrect checksum in control file")));
    4042 EUB             : 
    4043                 :     /*
    4044                 :      * Do compatibility checking immediately.  If the database isn't
    4045                 :      * compatible with the backend executable, we want to abort before we can
    4046                 :      * possibly do any damage.
    4047                 :      */
    4048 CBC        1222 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
    4049 UBC           0 :         ereport(FATAL,
    4050                 :                 (errmsg("database files are incompatible with server"),
    4051                 :                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
    4052                 :                            " but the server was compiled with CATALOG_VERSION_NO %d.",
    4053                 :                            ControlFile->catalog_version_no, CATALOG_VERSION_NO),
    4054                 :                  errhint("It looks like you need to initdb.")));
    4055 CBC        1222 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
    4056 UBC           0 :         ereport(FATAL,
    4057                 :                 (errmsg("database files are incompatible with server"),
    4058                 :                  errdetail("The database cluster was initialized with MAXALIGN %d,"
    4059                 :                            " but the server was compiled with MAXALIGN %d.",
    4060                 :                            ControlFile->maxAlign, MAXIMUM_ALIGNOF),
    4061                 :                  errhint("It looks like you need to initdb.")));
    4062 CBC        1222 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
    4063 UBC           0 :         ereport(FATAL,
    4064                 :                 (errmsg("database files are incompatible with server"),
    4065                 :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
    4066                 :                  errhint("It looks like you need to initdb.")));
    4067 GIC        1222 :     if (ControlFile->blcksz != BLCKSZ)
    4068 UIC           0 :         ereport(FATAL,
    4069 ECB             :                 (errmsg("database files are incompatible with server"),
    4070 EUB             :                  errdetail("The database cluster was initialized with BLCKSZ %d,"
    4071                 :                            " but the server was compiled with BLCKSZ %d.",
    4072                 :                            ControlFile->blcksz, BLCKSZ),
    4073                 :                  errhint("It looks like you need to recompile or initdb.")));
    4074 GIC        1222 :     if (ControlFile->relseg_size != RELSEG_SIZE)
    4075 UIC           0 :         ereport(FATAL,
    4076 ECB             :                 (errmsg("database files are incompatible with server"),
    4077 EUB             :                  errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
    4078                 :                            " but the server was compiled with RELSEG_SIZE %d.",
    4079                 :                            ControlFile->relseg_size, RELSEG_SIZE),
    4080                 :                  errhint("It looks like you need to recompile or initdb.")));
    4081 GIC        1222 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
    4082 UIC           0 :         ereport(FATAL,
    4083 ECB             :                 (errmsg("database files are incompatible with server"),
    4084 EUB             :                  errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
    4085                 :                            " but the server was compiled with XLOG_BLCKSZ %d.",
    4086                 :                            ControlFile->xlog_blcksz, XLOG_BLCKSZ),
    4087                 :                  errhint("It looks like you need to recompile or initdb.")));
    4088 GIC        1222 :     if (ControlFile->nameDataLen != NAMEDATALEN)
    4089 UIC           0 :         ereport(FATAL,
    4090                 :                 (errmsg("database files are incompatible with server"),
    4091                 :                  errdetail("The database cluster was initialized with NAMEDATALEN %d,"
    4092 ECB             :                            " but the server was compiled with NAMEDATALEN %d.",
    4093 EUB             :                            ControlFile->nameDataLen, NAMEDATALEN),
    4094                 :                  errhint("It looks like you need to recompile or initdb.")));
    4095 GIC        1222 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
    4096 UIC           0 :         ereport(FATAL,
    4097                 :                 (errmsg("database files are incompatible with server"),
    4098                 :                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
    4099                 :                            " but the server was compiled with INDEX_MAX_KEYS %d.",
    4100                 :                            ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
    4101                 :                  errhint("It looks like you need to recompile or initdb.")));
    4102 GIC        1222 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
    4103 UIC           0 :         ereport(FATAL,
    4104                 :                 (errmsg("database files are incompatible with server"),
    4105                 :                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
    4106                 :                            " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
    4107 ECB             :                            ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
    4108                 :                  errhint("It looks like you need to recompile or initdb.")));
    4109 CBC        1222 :     if (ControlFile->loblksize != LOBLKSIZE)
    4110 UBC           0 :         ereport(FATAL,
    4111                 :                 (errmsg("database files are incompatible with server"),
    4112                 :                  errdetail("The database cluster was initialized with LOBLKSIZE %d,"
    4113                 :                            " but the server was compiled with LOBLKSIZE %d.",
    4114                 :                            ControlFile->loblksize, (int) LOBLKSIZE),
    4115                 :                  errhint("It looks like you need to recompile or initdb.")));
    4116 ECB             : 
    4117                 : #ifdef USE_FLOAT8_BYVAL
    4118 GIC        1222 :     if (ControlFile->float8ByVal != true)
    4119 UIC           0 :         ereport(FATAL,
    4120                 :                 (errmsg("database files are incompatible with server"),
    4121 ECB             :                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
    4122 EUB             :                            " but the server was compiled with USE_FLOAT8_BYVAL."),
    4123                 :                  errhint("It looks like you need to recompile or initdb.")));
    4124                 : #else
    4125 ECB             :     if (ControlFile->float8ByVal != false)
    4126 EUB             :         ereport(FATAL,
    4127                 :                 (errmsg("database files are incompatible with server"),
    4128                 :                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
    4129 ECB             :                            " but the server was compiled without USE_FLOAT8_BYVAL."),
    4130                 :                  errhint("It looks like you need to recompile or initdb.")));
    4131                 : #endif
    4132                 : 
    4133 CBC        1222 :     wal_segment_size = ControlFile->xlog_seg_size;
    4134                 : 
    4135 GIC        1222 :     if (!IsValidWalSegSize(wal_segment_size))
    4136 LBC           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4137                 :                         errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
    4138 ECB             :                                       "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
    4139                 :                                       wal_segment_size,
    4140                 :                                       wal_segment_size)));
    4141                 : 
    4142 GIC        1222 :     snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
    4143            1222 :     SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
    4144                 :                     PGC_S_DYNAMIC_DEFAULT);
    4145 ECB             : 
    4146                 :     /* check and update variables dependent on wal_segment_size */
    4147 CBC        1222 :     if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
    4148 LBC           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4149                 :                         errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
    4150                 : 
    4151 GIC        1222 :     if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
    4152 UIC           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4153                 :                         errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
    4154 ECB             : 
    4155 GIC        1222 :     UsableBytesInSegment =
    4156 CBC        1222 :         (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
    4157 ECB             :         (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
    4158                 : 
    4159 GIC        1222 :     CalculateCheckpointSegments();
    4160                 : 
    4161                 :     /* Make the initdb settings visible as GUC variables, too */
    4162            1222 :     SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
    4163                 :                     PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
    4164 CBC        1222 : }
    4165                 : 
    4166 ECB             : /*
    4167                 :  * Utility wrapper to update the control file.  Note that the control
    4168                 :  * file gets flushed.
    4169                 :  */
    4170                 : static void
    4171 GIC       10909 : UpdateControlFile(void)
    4172                 : {
    4173           10909 :     update_controlfile(DataDir, ControlFile, true);
    4174 CBC       10909 : }
    4175                 : 
    4176 ECB             : /*
    4177                 :  * Returns the unique system identifier from control file.
    4178                 :  */
    4179                 : uint64
    4180 GIC         926 : GetSystemIdentifier(void)
    4181                 : {
    4182             926 :     Assert(ControlFile != NULL);
    4183             926 :     return ControlFile->system_identifier;
    4184                 : }
    4185                 : 
    4186                 : /*
    4187                 :  * Returns the random nonce from control file.
    4188                 :  */
    4189                 : char *
    4190 CBC           1 : GetMockAuthenticationNonce(void)
    4191                 : {
    4192 GIC           1 :     Assert(ControlFile != NULL);
    4193               1 :     return ControlFile->mock_authentication_nonce;
    4194                 : }
    4195 ECB             : 
    4196                 : /*
    4197                 :  * Are checksums enabled for data pages?
    4198                 :  */
    4199                 : bool
    4200 GIC    14333955 : DataChecksumsEnabled(void)
    4201                 : {
    4202        14333955 :     Assert(ControlFile != NULL);
    4203        14333955 :     return (ControlFile->data_checksum_version > 0);
    4204                 : }
    4205                 : 
    4206                 : /*
    4207                 :  * Returns a fake LSN for unlogged relations.
    4208                 :  *
    4209                 :  * Each call generates an LSN that is greater than any previous value
    4210                 :  * returned. The current counter value is saved and restored across clean
    4211                 :  * shutdowns, but like unlogged relations, does not survive a crash. This can
    4212                 :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
    4213                 :  * LSN-like increasing sequence of numbers without writing any WAL.
    4214 ECB             :  */
    4215                 : XLogRecPtr
    4216 GIC          33 : GetFakeLSNForUnloggedRel(void)
    4217                 : {
    4218 ECB             :     XLogRecPtr  nextUnloggedLSN;
    4219                 : 
    4220                 :     /* increment the unloggedLSN counter, need SpinLock */
    4221 CBC          33 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
    4222              33 :     nextUnloggedLSN = XLogCtl->unloggedLSN++;
    4223              33 :     SpinLockRelease(&XLogCtl->ulsn_lck);
    4224                 : 
    4225 GIC          33 :     return nextUnloggedLSN;
    4226                 : }
    4227                 : 
    4228                 : /*
    4229                 :  * Auto-tune the number of XLOG buffers.
    4230 ECB             :  *
    4231                 :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
    4232                 :  * a maximum of one XLOG segment (there is little reason to think that more
    4233                 :  * is helpful, at least so long as we force an fsync when switching log files)
    4234                 :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
    4235                 :  * 9.1, when auto-tuning was added).
    4236                 :  *
    4237                 :  * This should not be called until NBuffers has received its final value.
    4238                 :  */
    4239                 : static int
    4240 GIC        1825 : XLOGChooseNumBuffers(void)
    4241 ECB             : {
    4242                 :     int         xbuffers;
    4243                 : 
    4244 GIC        1825 :     xbuffers = NBuffers / 32;
    4245 GBC        1825 :     if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
    4246 GIC          18 :         xbuffers = (wal_segment_size / XLOG_BLCKSZ);
    4247            1825 :     if (xbuffers < 8)
    4248             327 :         xbuffers = 8;
    4249            1825 :     return xbuffers;
    4250                 : }
    4251                 : 
    4252                 : /*
    4253                 :  * GUC check_hook for wal_buffers
    4254                 :  */
    4255 ECB             : bool
    4256 GBC        3682 : check_wal_buffers(int *newval, void **extra, GucSource source)
    4257                 : {
    4258 ECB             :     /*
    4259                 :      * -1 indicates a request for auto-tune.
    4260                 :      */
    4261 GIC        3682 :     if (*newval == -1)
    4262                 :     {
    4263                 :         /*
    4264                 :          * If we haven't yet changed the boot_val default of -1, just let it
    4265 ECB             :          * be.  We'll fix it when XLOGShmemSize is called.
    4266                 :          */
    4267 GIC        1857 :         if (XLOGbuffers == -1)
    4268            1857 :             return true;
    4269                 : 
    4270                 :         /* Otherwise, substitute the auto-tune value */
    4271 UIC           0 :         *newval = XLOGChooseNumBuffers();
    4272                 :     }
    4273 ECB             : 
    4274                 :     /*
    4275                 :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
    4276                 :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
    4277                 :      * the case, we just silently treat such values as a request for the
    4278                 :      * minimum.  (We could throw an error instead, but that doesn't seem very
    4279                 :      * helpful.)
    4280                 :      */
    4281 GIC        1825 :     if (*newval < 4)
    4282 UBC           0 :         *newval = 4;
    4283 EUB             : 
    4284 GBC        1825 :     return true;
    4285 EUB             : }
    4286                 : 
    4287                 : /*
    4288                 :  * GUC check_hook for wal_consistency_checking
    4289                 :  */
    4290                 : bool
    4291 GNC        1859 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
    4292                 : {
    4293                 :     char       *rawstring;
    4294                 :     List       *elemlist;
    4295                 :     ListCell   *l;
    4296                 :     bool        newwalconsistency[RM_MAX_ID + 1];
    4297                 : 
    4298                 :     /* Initialize the array */
    4299           61347 :     MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
    4300                 : 
    4301                 :     /* Need a modifiable copy of string */
    4302            1859 :     rawstring = pstrdup(*newval);
    4303                 : 
    4304                 :     /* Parse string into list of identifiers */
    4305            1859 :     if (!SplitIdentifierString(rawstring, ',', &elemlist))
    4306                 :     {
    4307                 :         /* syntax error in list */
    4308 UNC           0 :         GUC_check_errdetail("List syntax is invalid.");
    4309               0 :         pfree(rawstring);
    4310               0 :         list_free(elemlist);
    4311               0 :         return false;
    4312                 :     }
    4313                 : 
    4314 GNC        1861 :     foreach(l, elemlist)
    4315                 :     {
    4316               2 :         char       *tok = (char *) lfirst(l);
    4317                 :         int         rmid;
    4318                 : 
    4319                 :         /* Check for 'all'. */
    4320               2 :         if (pg_strcasecmp(tok, "all") == 0)
    4321                 :         {
    4322 UNC           0 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    4323               0 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
    4324               0 :                     newwalconsistency[rmid] = true;
    4325                 :         }
    4326                 :         else
    4327                 :         {
    4328                 :             /* Check if the token matches any known resource manager. */
    4329 GNC           2 :             bool        found = false;
    4330                 : 
    4331              36 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
    4332                 :             {
    4333              54 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
    4334              18 :                     pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
    4335                 :                 {
    4336               2 :                     newwalconsistency[rmid] = true;
    4337               2 :                     found = true;
    4338               2 :                     break;
    4339                 :                 }
    4340                 :             }
    4341               2 :             if (!found)
    4342                 :             {
    4343                 :                 /*
    4344                 :                  * During startup, it might be a not-yet-loaded custom
    4345                 :                  * resource manager.  Defer checking until
    4346                 :                  * InitializeWalConsistencyChecking().
    4347                 :                  */
    4348 UNC           0 :                 if (!process_shared_preload_libraries_done)
    4349                 :                 {
    4350               0 :                     check_wal_consistency_checking_deferred = true;
    4351                 :                 }
    4352                 :                 else
    4353                 :                 {
    4354               0 :                     GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
    4355               0 :                     pfree(rawstring);
    4356               0 :                     list_free(elemlist);
    4357               0 :                     return false;
    4358                 :                 }
    4359                 :             }
    4360                 :         }
    4361                 :     }
    4362                 : 
    4363 GNC        1859 :     pfree(rawstring);
    4364            1859 :     list_free(elemlist);
    4365                 : 
    4366                 :     /* assign new value */
    4367            1859 :     *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool));
    4368            1859 :     memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
    4369            1859 :     return true;
    4370                 : }
    4371                 : 
    4372                 : /*
    4373                 :  * GUC assign_hook for wal_consistency_checking
    4374                 :  */
    4375                 : void
    4376            1859 : assign_wal_consistency_checking(const char *newval, void *extra)
    4377                 : {
    4378                 :     /*
    4379                 :      * If some checks were deferred, it's possible that the checks will fail
    4380                 :      * later during InitializeWalConsistencyChecking(). But in that case, the
    4381                 :      * postmaster will exit anyway, so it's safe to proceed with the
    4382                 :      * assignment.
    4383                 :      *
    4384                 :      * Any built-in resource managers specified are assigned immediately,
    4385                 :      * which affects WAL created before shared_preload_libraries are
    4386                 :      * processed. Any custom resource managers specified won't be assigned
    4387                 :      * until after shared_preload_libraries are processed, but that's OK
    4388                 :      * because WAL for a custom resource manager can't be written before the
    4389                 :      * module is loaded anyway.
    4390                 :      */
    4391            1859 :     wal_consistency_checking = extra;
    4392            1859 : }
    4393                 : 
    4394                 : /*
    4395                 :  * InitializeWalConsistencyChecking: run after loading custom resource managers
    4396                 :  *
    4397                 :  * If any unknown resource managers were specified in the
    4398                 :  * wal_consistency_checking GUC, processing was deferred.  Now that
    4399                 :  * shared_preload_libraries have been loaded, process wal_consistency_checking
    4400                 :  * again.
    4401                 :  */
    4402                 : void
    4403             910 : InitializeWalConsistencyChecking(void)
    4404                 : {
    4405             910 :     Assert(process_shared_preload_libraries_done);
    4406                 : 
    4407             910 :     if (check_wal_consistency_checking_deferred)
    4408                 :     {
    4409                 :         struct config_generic *guc;
    4410                 : 
    4411 UNC           0 :         guc = find_option("wal_consistency_checking", false, false, ERROR);
    4412                 : 
    4413               0 :         check_wal_consistency_checking_deferred = false;
    4414                 : 
    4415               0 :         set_config_option_ext("wal_consistency_checking",
    4416                 :                               wal_consistency_checking_string,
    4417                 :                               guc->scontext, guc->source, guc->srole,
    4418                 :                               GUC_ACTION_SET, true, ERROR, false);
    4419                 : 
    4420                 :         /* checking should not be deferred again */
    4421               0 :         Assert(!check_wal_consistency_checking_deferred);
    4422                 :     }
    4423 GNC         910 : }
    4424                 : 
    4425                 : /*
    4426                 :  * GUC show_hook for archive_command
    4427                 :  */
    4428                 : const char *
    4429            1088 : show_archive_command(void)
    4430                 : {
    4431            1088 :     if (XLogArchivingActive())
    4432 UNC           0 :         return XLogArchiveCommand;
    4433                 :     else
    4434 GNC        1088 :         return "(disabled)";
    4435                 : }
    4436                 : 
    4437                 : /*
    4438                 :  * GUC show_hook for in_hot_standby
    4439                 :  */
    4440                 : const char *
    4441            9962 : show_in_hot_standby(void)
    4442                 : {
    4443                 :     /*
    4444                 :      * We display the actual state based on shared memory, so that this GUC
    4445                 :      * reports up-to-date state if examined intra-query.  The underlying
    4446                 :      * variable (in_hot_standby_guc) changes only when we transmit a new value
    4447                 :      * to the client.
    4448                 :      */
    4449            9962 :     return RecoveryInProgress() ? "on" : "off";
    4450                 : }
    4451                 : 
    4452                 : /*
    4453 ECB             :  * Read the control file, set respective GUCs.
    4454                 :  *
    4455                 :  * This is to be called during startup, including a crash recovery cycle,
    4456                 :  * unless in bootstrap mode, where no control file yet exists.  As there's no
    4457                 :  * usable shared memory yet (its sizing can depend on the contents of the
    4458                 :  * control file!), first store the contents in local memory. XLOGShmemInit()
    4459                 :  * will then copy it to shared memory later.
    4460                 :  *
    4461 EUB             :  * reset just controls whether previous contents are to be expected (in the
    4462                 :  * reset case, there's a dangling pointer into old shared memory), or not.
    4463                 :  */
    4464                 : void
    4465 GIC         917 : LocalProcessControlFile(bool reset)
    4466                 : {
    4467             917 :     Assert(reset || ControlFile == NULL);
    4468 CBC         917 :     ControlFile = palloc(sizeof(ControlFileData));
    4469 GIC         917 :     ReadControlFile();
    4470 CBC         917 : }
    4471                 : 
    4472                 : /*
    4473                 :  * Get the wal_level from the control file. For a standby, this value should be
    4474                 :  * considered as its active wal_level, because it may be different from what
    4475                 :  * was originally configured on standby.
    4476                 :  */
    4477                 : WalLevel
    4478 GNC          61 : GetActiveWalLevelOnStandby(void)
    4479                 : {
    4480              61 :     return ControlFile->wal_level;
    4481                 : }
    4482                 : 
    4483 ECB             : /*
    4484                 :  * Initialization of shared memory for XLOG
    4485                 :  */
    4486                 : Size
    4487 CBC        4564 : XLOGShmemSize(void)
    4488 ECB             : {
    4489                 :     Size        size;
    4490                 : 
    4491                 :     /*
    4492                 :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
    4493                 :      * This isn't an amazingly clean place to do this, but we must wait till
    4494                 :      * NBuffers has received its final value, and must do it before using the
    4495                 :      * value of XLOGbuffers to do anything important.
    4496                 :      *
    4497                 :      * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
    4498 EUB             :      * However, if the DBA explicitly set wal_buffers = -1 in the config file,
    4499                 :      * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
    4500                 :      * the matter with PGC_S_OVERRIDE.
    4501                 :      */
    4502 GIC        4564 :     if (XLOGbuffers == -1)
    4503                 :     {
    4504 EUB             :         char        buf[32];
    4505                 : 
    4506 GBC        1825 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
    4507            1825 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    4508                 :                         PGC_S_DYNAMIC_DEFAULT);
    4509 GIC        1825 :         if (XLOGbuffers == -1)  /* failed to apply it? */
    4510 UIC           0 :             SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
    4511                 :                             PGC_S_OVERRIDE);
    4512                 :     }
    4513 CBC        4564 :     Assert(XLOGbuffers > 0);
    4514 ECB             : 
    4515                 :     /* XLogCtl */
    4516 GIC        4564 :     size = sizeof(XLogCtlData);
    4517 ECB             : 
    4518                 :     /* WAL insertion locks, plus alignment */
    4519 CBC        4564 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
    4520                 :     /* xlblocks array */
    4521 GIC        4564 :     size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
    4522                 :     /* extra alignment padding for XLOG I/O buffers */
    4523 GNC        4564 :     size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
    4524                 :     /* and the buffers themselves */
    4525 GIC        4564 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
    4526 ECB             : 
    4527                 :     /*
    4528                 :      * Note: we don't count ControlFileData, it comes out of the "slop factor"
    4529                 :      * added by CreateSharedMemoryAndSemaphores.  This lets us use this
    4530                 :      * routine again below to compute the actual allocation size.
    4531                 :      */
    4532                 : 
    4533 GIC        4564 :     return size;
    4534                 : }
    4535                 : 
    4536                 : void
    4537            1826 : XLOGShmemInit(void)
    4538                 : {
    4539                 :     bool        foundCFile,
    4540                 :                 foundXLog;
    4541 ECB             :     char       *allocptr;
    4542                 :     int         i;
    4543                 :     ControlFileData *localControlFile;
    4544                 : 
    4545                 : #ifdef WAL_DEBUG
    4546                 : 
    4547                 :     /*
    4548                 :      * Create a memory context for WAL debugging that's exempt from the normal
    4549                 :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
    4550                 :      * an allocation fails, but wal_debug is not for production use anyway.
    4551                 :      */
    4552                 :     if (walDebugCxt == NULL)
    4553                 :     {
    4554                 :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
    4555                 :                                             "WAL Debug",
    4556                 :                                             ALLOCSET_DEFAULT_SIZES);
    4557                 :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
    4558                 :     }
    4559                 : #endif
    4560                 : 
    4561 EUB             : 
    4562 GIC        1826 :     XLogCtl = (XLogCtlData *)
    4563 GBC        1826 :         ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
    4564                 : 
    4565            1826 :     localControlFile = ControlFile;
    4566 GIC        1826 :     ControlFile = (ControlFileData *)
    4567            1826 :         ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
    4568                 : 
    4569            1826 :     if (foundCFile || foundXLog)
    4570                 :     {
    4571 EUB             :         /* both should be present or neither */
    4572 UIC           0 :         Assert(foundCFile && foundXLog);
    4573 ECB             : 
    4574                 :         /* Initialize local copy of WALInsertLocks */
    4575 UIC           0 :         WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
    4576                 : 
    4577               0 :         if (localControlFile)
    4578               0 :             pfree(localControlFile);
    4579 LBC           0 :         return;
    4580                 :     }
    4581 CBC        1826 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
    4582 EUB             : 
    4583                 :     /*
    4584 ECB             :      * Already have read control file locally, unless in bootstrap mode. Move
    4585                 :      * contents into shared memory.
    4586                 :      */
    4587 GIC        1826 :     if (localControlFile)
    4588                 :     {
    4589             911 :         memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
    4590             911 :         pfree(localControlFile);
    4591 ECB             :     }
    4592                 : 
    4593                 :     /*
    4594                 :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
    4595                 :      * multiple of the alignment for same, so no extra alignment padding is
    4596                 :      * needed here.
    4597                 :      */
    4598 GIC        1826 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
    4599 CBC        1826 :     XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
    4600 GIC        1826 :     memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
    4601            1826 :     allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
    4602                 : 
    4603                 : 
    4604                 :     /* WAL insertion locks. Ensure they're aligned to the full padded size */
    4605            1826 :     allocptr += sizeof(WALInsertLockPadded) -
    4606            1826 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
    4607            1826 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
    4608                 :         (WALInsertLockPadded *) allocptr;
    4609            1826 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
    4610                 : 
    4611           16434 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    4612                 :     {
    4613           14608 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
    4614           14608 :         WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
    4615 CBC       14608 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
    4616                 :     }
    4617 ECB             : 
    4618                 :     /*
    4619                 :      * Align the start of the page buffers to a full xlog block size boundary.
    4620                 :      * This simplifies some calculations in XLOG insertion. It is also
    4621                 :      * required for O_DIRECT.
    4622                 :      */
    4623 GIC        1826 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
    4624            1826 :     XLogCtl->pages = allocptr;
    4625            1826 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
    4626                 : 
    4627                 :     /*
    4628 ECB             :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
    4629                 :      * in additional info.)
    4630                 :      */
    4631 GIC        1826 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    4632            1826 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    4633            1826 :     XLogCtl->InstallXLogFileSegmentActive = false;
    4634            1826 :     XLogCtl->WalWriterSleeping = false;
    4635                 : 
    4636            1826 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
    4637 CBC        1826 :     SpinLockInit(&XLogCtl->info_lck);
    4638 GIC        1826 :     SpinLockInit(&XLogCtl->ulsn_lck);
    4639                 : }
    4640                 : 
    4641                 : /*
    4642                 :  * This func must be called ONCE on system install.  It creates pg_control
    4643                 :  * and the initial XLOG segment.
    4644                 :  */
    4645                 : void
    4646             305 : BootStrapXLOG(void)
    4647                 : {
    4648                 :     CheckPoint  checkPoint;
    4649                 :     char       *buffer;
    4650                 :     XLogPageHeader page;
    4651                 :     XLogLongPageHeader longpage;
    4652 ECB             :     XLogRecord *record;
    4653                 :     char       *recptr;
    4654                 :     uint64      sysidentifier;
    4655                 :     struct timeval tv;
    4656                 :     pg_crc32c   crc;
    4657                 : 
    4658                 :     /* allow ordinary WAL segment creation, like StartupXLOG() would */
    4659 GNC         305 :     SetInstallXLogFileSegmentActive();
    4660                 : 
    4661 ECB             :     /*
    4662                 :      * Select a hopefully-unique system identifier code for this installation.
    4663                 :      * We use the result of gettimeofday(), including the fractional seconds
    4664                 :      * field, as being about as unique as we can easily get.  (Think not to
    4665                 :      * use random(), since it hasn't been seeded and there's no portable way
    4666                 :      * to seed it other than the system clock value...)  The upper half of the
    4667                 :      * uint64 value is just the tv_sec part, while the lower half contains the
    4668                 :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
    4669                 :      * PID for a little extra uniqueness.  A person knowing this encoding can
    4670                 :      * determine the initialization time of the installation, which could
    4671                 :      * perhaps be useful sometimes.
    4672                 :      */
    4673 CBC         305 :     gettimeofday(&tv, NULL);
    4674 GIC         305 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
    4675             305 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
    4676             305 :     sysidentifier |= getpid() & 0xFFF;
    4677                 : 
    4678                 :     /* page buffer must be aligned suitably for O_DIRECT */
    4679             305 :     buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
    4680             305 :     page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
    4681 CBC         305 :     memset(page, 0, XLOG_BLCKSZ);
    4682                 : 
    4683                 :     /*
    4684                 :      * Set up information for the initial checkpoint record
    4685 ECB             :      *
    4686                 :      * The initial checkpoint record is written to the beginning of the WAL
    4687                 :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
    4688                 :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
    4689                 :      */
    4690 GIC         305 :     checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
    4691             305 :     checkPoint.ThisTimeLineID = BootstrapTimeLineID;
    4692             305 :     checkPoint.PrevTimeLineID = BootstrapTimeLineID;
    4693             305 :     checkPoint.fullPageWrites = fullPageWrites;
    4694                 :     checkPoint.nextXid =
    4695             305 :         FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
    4696             305 :     checkPoint.nextOid = FirstGenbkiObjectId;
    4697             305 :     checkPoint.nextMulti = FirstMultiXactId;
    4698             305 :     checkPoint.nextMultiOffset = 0;
    4699             305 :     checkPoint.oldestXid = FirstNormalTransactionId;
    4700             305 :     checkPoint.oldestXidDB = Template1DbOid;
    4701             305 :     checkPoint.oldestMulti = FirstMultiXactId;
    4702             305 :     checkPoint.oldestMultiDB = Template1DbOid;
    4703             305 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
    4704             305 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
    4705             305 :     checkPoint.time = (pg_time_t) time(NULL);
    4706             305 :     checkPoint.oldestActiveXid = InvalidTransactionId;
    4707                 : 
    4708             305 :     ShmemVariableCache->nextXid = checkPoint.nextXid;
    4709             305 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
    4710 CBC         305 :     ShmemVariableCache->oidCount = 0;
    4711             305 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    4712 GIC         305 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    4713 CBC         305 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    4714             305 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    4715             305 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
    4716                 : 
    4717 ECB             :     /* Set up the XLOG page header */
    4718 GIC         305 :     page->xlp_magic = XLOG_PAGE_MAGIC;
    4719             305 :     page->xlp_info = XLP_LONG_HEADER;
    4720 GBC         305 :     page->xlp_tli = BootstrapTimeLineID;
    4721 GIC         305 :     page->xlp_pageaddr = wal_segment_size;
    4722             305 :     longpage = (XLogLongPageHeader) page;
    4723 GBC         305 :     longpage->xlp_sysid = sysidentifier;
    4724 GIC         305 :     longpage->xlp_seg_size = wal_segment_size;
    4725 GBC         305 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
    4726 EUB             : 
    4727                 :     /* Insert the initial checkpoint record */
    4728 GIC         305 :     recptr = ((char *) page + SizeOfXLogLongPHD);
    4729 CBC         305 :     record = (XLogRecord *) recptr;
    4730 GIC         305 :     record->xl_prev = 0;
    4731             305 :     record->xl_xid = InvalidTransactionId;
    4732             305 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
    4733             305 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    4734             305 :     record->xl_rmid = RM_XLOG_ID;
    4735 CBC         305 :     recptr += SizeOfXLogRecord;
    4736                 :     /* fill the XLogRecordDataHeaderShort struct */
    4737             305 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
    4738             305 :     *(recptr++) = sizeof(checkPoint);
    4739 GIC         305 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
    4740             305 :     recptr += sizeof(checkPoint);
    4741             305 :     Assert(recptr - (char *) record == record->xl_tot_len);
    4742                 : 
    4743             305 :     INIT_CRC32C(crc);
    4744             305 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
    4745             305 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    4746 CBC         305 :     FIN_CRC32C(crc);
    4747             305 :     record->xl_crc = crc;
    4748 ECB             : 
    4749                 :     /* Create first XLOG segment file */
    4750 GIC         305 :     openLogTLI = BootstrapTimeLineID;
    4751             305 :     openLogFile = XLogFileInit(1, BootstrapTimeLineID);
    4752                 : 
    4753 ECB             :     /*
    4754                 :      * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
    4755                 :      * close the file again in a moment.
    4756                 :      */
    4757                 : 
    4758                 :     /* Write the first page with the initial record */
    4759 CBC         305 :     errno = 0;
    4760 GIC         305 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
    4761 CBC         305 :     if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    4762 ECB             :     {
    4763                 :         /* if write didn't set errno, assume problem is no disk space */
    4764 UIC           0 :         if (errno == 0)
    4765               0 :             errno = ENOSPC;
    4766               0 :         ereport(PANIC,
    4767                 :                 (errcode_for_file_access(),
    4768                 :                  errmsg("could not write bootstrap write-ahead log file: %m")));
    4769                 :     }
    4770 GIC         305 :     pgstat_report_wait_end();
    4771 ECB             : 
    4772 CBC         305 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
    4773             305 :     if (pg_fsync(openLogFile) != 0)
    4774 UIC           0 :         ereport(PANIC,
    4775                 :                 (errcode_for_file_access(),
    4776                 :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
    4777 GIC         305 :     pgstat_report_wait_end();
    4778                 : 
    4779 CBC         305 :     if (close(openLogFile) != 0)
    4780 LBC           0 :         ereport(PANIC,
    4781 ECB             :                 (errcode_for_file_access(),
    4782                 :                  errmsg("could not close bootstrap write-ahead log file: %m")));
    4783                 : 
    4784 CBC         305 :     openLogFile = -1;
    4785 ECB             : 
    4786                 :     /* Now create pg_control */
    4787 GIC         305 :     InitControlFile(sysidentifier);
    4788             305 :     ControlFile->time = checkPoint.time;
    4789             305 :     ControlFile->checkPoint = checkPoint.redo;
    4790             305 :     ControlFile->checkPointCopy = checkPoint;
    4791                 : 
    4792                 :     /* some additional ControlFile fields are set in WriteControlFile() */
    4793             305 :     WriteControlFile();
    4794 ECB             : 
    4795                 :     /* Bootstrap the commit log, too */
    4796 GIC         305 :     BootStrapCLOG();
    4797             305 :     BootStrapCommitTs();
    4798             305 :     BootStrapSUBTRANS();
    4799             305 :     BootStrapMultiXact();
    4800                 : 
    4801             305 :     pfree(buffer);
    4802                 : 
    4803                 :     /*
    4804                 :      * Force control file to be read - in contrast to normal processing we'd
    4805                 :      * otherwise never run the checks and GUC related initializations therein.
    4806                 :      */
    4807 CBC         305 :     ReadControlFile();
    4808 GIC         305 : }
    4809                 : 
    4810                 : static char *
    4811             566 : str_time(pg_time_t tnow)
    4812                 : {
    4813                 :     static char buf[128];
    4814                 : 
    4815             566 :     pg_strftime(buf, sizeof(buf),
    4816                 :                 "%Y-%m-%d %H:%M:%S %Z",
    4817             566 :                 pg_localtime(&tnow, log_timezone));
    4818                 : 
    4819             566 :     return buf;
    4820                 : }
    4821 ECB             : 
    4822                 : /*
    4823                 :  * Initialize the first WAL segment on new timeline.
    4824                 :  */
    4825                 : static void
    4826 GIC          39 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
    4827 ECB             : {
    4828                 :     char        xlogfname[MAXFNAMELEN];
    4829                 :     XLogSegNo   endLogSegNo;
    4830                 :     XLogSegNo   startLogSegNo;
    4831                 : 
    4832                 :     /* we always switch to a new timeline after archive recovery */
    4833 GIC          39 :     Assert(endTLI != newTLI);
    4834                 : 
    4835                 :     /*
    4836                 :      * Update min recovery point one last time.
    4837                 :      */
    4838 CBC          39 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    4839 ECB             : 
    4840                 :     /*
    4841                 :      * Calculate the last segment on the old timeline, and the first segment
    4842                 :      * on the new timeline. If the switch happens in the middle of a segment,
    4843                 :      * they are the same, but if the switch happens exactly at a segment
    4844                 :      * boundary, startLogSegNo will be endLogSegNo + 1.
    4845                 :      */
    4846 CBC          39 :     XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
    4847              39 :     XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
    4848 ECB             : 
    4849                 :     /*
    4850                 :      * Initialize the starting WAL segment for the new timeline. If the switch
    4851                 :      * happens in the middle of a segment, copy data from the last WAL segment
    4852                 :      * of the old timeline up to the switch point, to the starting WAL segment
    4853                 :      * on the new timeline.
    4854                 :      */
    4855 GIC          39 :     if (endLogSegNo == startLogSegNo)
    4856 ECB             :     {
    4857                 :         /*
    4858                 :          * Make a copy of the file on the new timeline.
    4859                 :          *
    4860                 :          * Writing WAL isn't allowed yet, so there are no locking
    4861                 :          * considerations. But we should be just as tense as XLogFileInit to
    4862                 :          * avoid emplacing a bogus file.
    4863                 :          */
    4864 GIC          30 :         XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
    4865              30 :                      XLogSegmentOffset(endOfLog, wal_segment_size));
    4866 ECB             :     }
    4867                 :     else
    4868                 :     {
    4869                 :         /*
    4870                 :          * The switch happened at a segment boundary, so just create the next
    4871                 :          * segment on the new timeline.
    4872                 :          */
    4873                 :         int         fd;
    4874                 : 
    4875 GIC           9 :         fd = XLogFileInit(startLogSegNo, newTLI);
    4876 ECB             : 
    4877 CBC           9 :         if (close(fd) != 0)
    4878 ECB             :         {
    4879 LBC           0 :             int         save_errno = errno;
    4880 ECB             : 
    4881 LBC           0 :             XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    4882               0 :             errno = save_errno;
    4883 UIC           0 :             ereport(ERROR,
    4884 ECB             :                     (errcode_for_file_access(),
    4885                 :                      errmsg("could not close file \"%s\": %m", xlogfname)));
    4886                 :         }
    4887                 :     }
    4888                 : 
    4889                 :     /*
    4890                 :      * Let's just make real sure there are not .ready or .done flags posted
    4891                 :      * for the new segment.
    4892                 :      */
    4893 CBC          39 :     XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
    4894              39 :     XLogArchiveCleanup(xlogfname);
    4895 GIC          39 : }
    4896                 : 
    4897 ECB             : /*
    4898                 :  * Perform cleanup actions at the conclusion of archive recovery.
    4899                 :  */
    4900                 : static void
    4901 GIC          39 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
    4902                 :                             TimeLineID newTLI)
    4903                 : {
    4904                 :     /*
    4905                 :      * Execute the recovery_end_command, if any.
    4906 ECB             :      */
    4907 CBC          39 :     if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
    4908               2 :         ExecuteRecoveryCommand(recoveryEndCommand,
    4909                 :                                "recovery_end_command",
    4910                 :                                true,
    4911 EUB             :                                WAIT_EVENT_RECOVERY_END_COMMAND);
    4912                 : 
    4913                 :     /*
    4914                 :      * We switched to a new timeline. Clean up segments on the old timeline.
    4915                 :      *
    4916                 :      * If there are any higher-numbered segments on the old timeline, remove
    4917 ECB             :      * them. They might contain valid WAL, but they might also be
    4918                 :      * pre-allocated files containing garbage. In any case, they are not part
    4919                 :      * of the new timeline's history so we don't need them.
    4920                 :      */
    4921 GBC          39 :     RemoveNonParentXlogFiles(EndOfLog, newTLI);
    4922                 : 
    4923                 :     /*
    4924 ECB             :      * If the switch happened in the middle of a segment, what to do with the
    4925                 :      * last, partial segment on the old timeline? If we don't archive it, and
    4926                 :      * the server that created the WAL never archives it either (e.g. because
    4927 EUB             :      * it was hit by a meteor), it will never make it to the archive. That's
    4928                 :      * OK from our point of view, because the new segment that we created with
    4929                 :      * the new TLI contains all the WAL from the old timeline up to the switch
    4930                 :      * point. But if you later try to do PITR to the "missing" WAL on the old
    4931 ECB             :      * timeline, recovery won't find it in the archive. It's physically
    4932                 :      * present in the new file with new TLI, but recovery won't look there
    4933                 :      * when it's recovering to the older timeline. On the other hand, if we
    4934                 :      * archive the partial segment, and the original server on that timeline
    4935                 :      * is still running and archives the completed version of the same segment
    4936                 :      * later, it will fail. (We used to do that in 9.4 and below, and it
    4937                 :      * caused such problems).
    4938                 :      *
    4939                 :      * As a compromise, we rename the last segment with the .partial suffix,
    4940                 :      * and archive it. Archive recovery will never try to read .partial
    4941                 :      * segments, so they will normally go unused. But in the odd PITR case,
    4942                 :      * the administrator can copy them manually to the pg_wal directory
    4943                 :      * (removing the suffix). They can be useful in debugging, too.
    4944                 :      *
    4945                 :      * If a .done or .ready file already exists for the old timeline, however,
    4946                 :      * we had already determined that the segment is complete, so we can let
    4947                 :      * it be archived normally. (In particular, if it was restored from the
    4948                 :      * archive to begin with, it's expected to have a .done file).
    4949                 :      */
    4950 GIC          39 :     if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
    4951              30 :         XLogArchivingActive())
    4952                 :     {
    4953                 :         char        origfname[MAXFNAMELEN];
    4954 ECB             :         XLogSegNo   endLogSegNo;
    4955                 : 
    4956 GIC           6 :         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
    4957               6 :         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
    4958 ECB             : 
    4959 GIC           6 :         if (!XLogArchiveIsReadyOrDone(origfname))
    4960                 :         {
    4961                 :             char        origpath[MAXPGPATH];
    4962 ECB             :             char        partialfname[MAXFNAMELEN];
    4963                 :             char        partialpath[MAXPGPATH];
    4964                 : 
    4965 GIC           4 :             XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
    4966 CBC           4 :             snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
    4967 GIC           4 :             snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
    4968                 : 
    4969                 :             /*
    4970                 :              * Make sure there's no .done or .ready file for the .partial
    4971                 :              * file.
    4972                 :              */
    4973 CBC           4 :             XLogArchiveCleanup(partialfname);
    4974                 : 
    4975 GIC           4 :             durable_rename(origpath, partialpath, ERROR);
    4976               4 :             XLogArchiveNotify(partialfname);
    4977                 :         }
    4978                 :     }
    4979              39 : }
    4980 ECB             : 
    4981                 : /*
    4982                 :  * Check to see if required parameters are set high enough on this server
    4983                 :  * for various aspects of recovery operation.
    4984                 :  *
    4985                 :  * Note that all the parameters which this function tests need to be
    4986                 :  * listed in Administrator's Overview section in high-availability.sgml.
    4987                 :  * If you change them, don't forget to update the list.
    4988                 :  */
    4989                 : static void
    4990 GIC         170 : CheckRequiredParameterValues(void)
    4991                 : {
    4992                 :     /*
    4993 ECB             :      * For archive recovery, the WAL must be generated with at least 'replica'
    4994                 :      * wal_level.
    4995                 :      */
    4996 GIC         170 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    4997                 :     {
    4998               2 :         ereport(FATAL,
    4999                 :                 (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
    5000                 :                  errdetail("This happens if you temporarily set wal_level=minimal on the server."),
    5001                 :                  errhint("Use a backup taken after setting wal_level to higher than minimal.")));
    5002 ECB             :     }
    5003                 : 
    5004                 :     /*
    5005                 :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
    5006                 :      * must have at least as many backend slots as the primary.
    5007                 :      */
    5008 GIC         168 :     if (ArchiveRecoveryRequested && EnableHotStandby)
    5009                 :     {
    5010                 :         /* We ignore autovacuum_max_workers when we make this test. */
    5011 CBC          75 :         RecoveryRequiresIntParameter("max_connections",
    5012 ECB             :                                      MaxConnections,
    5013 GIC          75 :                                      ControlFile->MaxConnections);
    5014              75 :         RecoveryRequiresIntParameter("max_worker_processes",
    5015                 :                                      max_worker_processes,
    5016              75 :                                      ControlFile->max_worker_processes);
    5017              75 :         RecoveryRequiresIntParameter("max_wal_senders",
    5018                 :                                      max_wal_senders,
    5019              75 :                                      ControlFile->max_wal_senders);
    5020              75 :         RecoveryRequiresIntParameter("max_prepared_transactions",
    5021                 :                                      max_prepared_xacts,
    5022 CBC          75 :                                      ControlFile->max_prepared_xacts);
    5023 GIC          75 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
    5024 ECB             :                                      max_locks_per_xact,
    5025 GIC          75 :                                      ControlFile->max_locks_per_xact);
    5026 EUB             :     }
    5027 GIC         168 : }
    5028 EUB             : 
    5029                 : /*
    5030                 :  * This must be called ONCE during postmaster or standalone-backend startup
    5031                 :  */
    5032                 : void
    5033 GIC        1176 : StartupXLOG(void)
    5034                 : {
    5035                 :     XLogCtlInsert *Insert;
    5036                 :     CheckPoint  checkPoint;
    5037                 :     bool        wasShutdown;
    5038                 :     bool        didCrash;
    5039                 :     bool        haveTblspcMap;
    5040 ECB             :     bool        haveBackupLabel;
    5041                 :     XLogRecPtr  EndOfLog;
    5042                 :     TimeLineID  EndOfLogTLI;
    5043                 :     TimeLineID  newTLI;
    5044                 :     bool        performedWalRecovery;
    5045                 :     EndOfWalRecoveryInfo *endOfRecoveryInfo;
    5046                 :     XLogRecPtr  abortedRecPtr;
    5047                 :     XLogRecPtr  missingContrecPtr;
    5048                 :     TransactionId oldestActiveXID;
    5049 GIC        1176 :     bool        promoted = false;
    5050                 : 
    5051                 :     /*
    5052                 :      * We should have an aux process resource owner to use, and we should not
    5053                 :      * be in a transaction that's installed some other resowner.
    5054 ECB             :      */
    5055 CBC        1176 :     Assert(AuxProcessResourceOwner != NULL);
    5056 GIC        1176 :     Assert(CurrentResourceOwner == NULL ||
    5057                 :            CurrentResourceOwner == AuxProcessResourceOwner);
    5058            1176 :     CurrentResourceOwner = AuxProcessResourceOwner;
    5059                 : 
    5060                 :     /*
    5061                 :      * Check that contents look valid.
    5062                 :      */
    5063            1176 :     if (!XRecOffIsValid(ControlFile->checkPoint))
    5064 UIC           0 :         ereport(FATAL,
    5065                 :                 (errmsg("control file contains invalid checkpoint location")));
    5066                 : 
    5067 GIC        1176 :     switch (ControlFile->state)
    5068 ECB             :     {
    5069 GIC        1031 :         case DB_SHUTDOWNED:
    5070                 : 
    5071                 :             /*
    5072                 :              * This is the expected case, so don't be chatty in standalone
    5073                 :              * mode
    5074                 :              */
    5075            1031 :             ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    5076                 :                     (errmsg("database system was shut down at %s",
    5077                 :                             str_time(ControlFile->time))));
    5078            1031 :             break;
    5079                 : 
    5080              14 :         case DB_SHUTDOWNED_IN_RECOVERY:
    5081              14 :             ereport(LOG,
    5082                 :                     (errmsg("database system was shut down in recovery at %s",
    5083                 :                             str_time(ControlFile->time))));
    5084              14 :             break;
    5085                 : 
    5086 UIC           0 :         case DB_SHUTDOWNING:
    5087               0 :             ereport(LOG,
    5088                 :                     (errmsg("database system shutdown was interrupted; last known up at %s",
    5089                 :                             str_time(ControlFile->time))));
    5090               0 :             break;
    5091                 : 
    5092               0 :         case DB_IN_CRASH_RECOVERY:
    5093               0 :             ereport(LOG,
    5094                 :                     (errmsg("database system was interrupted while in recovery at %s",
    5095                 :                             str_time(ControlFile->time)),
    5096                 :                      errhint("This probably means that some data is corrupted and"
    5097 ECB             :                              " you will have to use the last backup for recovery.")));
    5098 LBC           0 :             break;
    5099                 : 
    5100 GIC           4 :         case DB_IN_ARCHIVE_RECOVERY:
    5101               4 :             ereport(LOG,
    5102                 :                     (errmsg("database system was interrupted while in recovery at log time %s",
    5103 ECB             :                             str_time(ControlFile->checkPointCopy.time)),
    5104                 :                      errhint("If this has occurred more than once some data might be corrupted"
    5105                 :                              " and you might need to choose an earlier recovery target.")));
    5106 CBC           4 :             break;
    5107                 : 
    5108 GIC         127 :         case DB_IN_PRODUCTION:
    5109             127 :             ereport(LOG,
    5110                 :                     (errmsg("database system was interrupted; last known up at %s",
    5111                 :                             str_time(ControlFile->time))));
    5112 CBC         127 :             break;
    5113 ECB             : 
    5114 LBC           0 :         default:
    5115 UIC           0 :             ereport(FATAL,
    5116                 :                     (errmsg("control file contains invalid database cluster state")));
    5117                 :     }
    5118                 : 
    5119                 :     /* This is just to allow attaching to startup process with a debugger */
    5120 ECB             : #ifdef XLOG_REPLAY_DELAY
    5121                 :     if (ControlFile->state != DB_SHUTDOWNED)
    5122                 :         pg_usleep(60000000L);
    5123                 : #endif
    5124                 : 
    5125                 :     /*
    5126                 :      * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
    5127                 :      * someone has performed a copy for PITR, these directories may have been
    5128                 :      * excluded and need to be re-created.
    5129                 :      */
    5130 GIC        1176 :     ValidateXLOGDirectoryStructure();
    5131                 : 
    5132                 :     /* Set up timeout handler needed to report startup progress. */
    5133            1176 :     if (!IsBootstrapProcessingMode())
    5134             871 :         RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
    5135                 :                         startup_progress_timeout_handler);
    5136                 : 
    5137 ECB             :     /*----------
    5138                 :      * If we previously crashed, perform a couple of actions:
    5139                 :      *
    5140                 :      * - The pg_wal directory may still include some temporary WAL segments
    5141                 :      *   used when creating a new segment, so perform some clean up to not
    5142                 :      *   bloat this path.  This is done first as there is no point to sync
    5143                 :      *   this temporary data.
    5144                 :      *
    5145                 :      * - There might be data which we had written, intending to fsync it, but
    5146                 :      *   which we had not actually fsync'd yet.  Therefore, a power failure in
    5147                 :      *   the near future might cause earlier unflushed writes to be lost, even
    5148                 :      *   though more recent data written to disk from here on would be
    5149                 :      *   persisted.  To avoid that, fsync the entire data directory.
    5150                 :      */
    5151 GIC        1176 :     if (ControlFile->state != DB_SHUTDOWNED &&
    5152             145 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
    5153                 :     {
    5154             131 :         RemoveTempXlogFiles();
    5155 CBC         131 :         SyncDataDirectory();
    5156 GIC         131 :         didCrash = true;
    5157                 :     }
    5158 ECB             :     else
    5159 GIC        1045 :         didCrash = false;
    5160 ECB             : 
    5161                 :     /*
    5162                 :      * Prepare for WAL recovery if needed.
    5163                 :      *
    5164                 :      * InitWalRecovery analyzes the control file and the backup label file, if
    5165                 :      * any.  It updates the in-memory ControlFile buffer according to the
    5166                 :      * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
    5167                 :      * It also applies the tablespace map file, if any.
    5168                 :      */
    5169 CBC        1176 :     InitWalRecovery(ControlFile, &wasShutdown,
    5170 ECB             :                     &haveBackupLabel, &haveTblspcMap);
    5171 GIC        1176 :     checkPoint = ControlFile->checkPointCopy;
    5172 ECB             : 
    5173                 :     /* initialize shared memory variables from the checkpoint record */
    5174 CBC        1176 :     ShmemVariableCache->nextXid = checkPoint.nextXid;
    5175 GIC        1176 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
    5176            1176 :     ShmemVariableCache->oidCount = 0;
    5177            1176 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    5178            1176 :     AdvanceOldestClogXid(checkPoint.oldestXid);
    5179            1176 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    5180 CBC        1176 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
    5181 GIC        1176 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
    5182                 :                      checkPoint.newestCommitTsXid);
    5183            1176 :     XLogCtl->ckptFullXid = checkPoint.nextXid;
    5184                 : 
    5185                 :     /*
    5186                 :      * Clear out any old relcache cache files.  This is *necessary* if we do
    5187                 :      * any WAL replay, since that would probably result in the cache files
    5188                 :      * being out of sync with database reality.  In theory we could leave them
    5189                 :      * in place if the database had been cleanly shut down, but it seems
    5190                 :      * safest to just remove them always and let them be rebuilt during the
    5191                 :      * first backend startup.  These files need to be removed from all
    5192                 :      * directories including pg_tblspc; however, the symlinks are created only
    5193                 :      * after reading the tablespace_map file in case of archive recovery from
    5194                 :      * backup, so we need to clear old relcache files here, after creating
    5195                 :      * the symlinks.
    5196 ECB             :      */
    5197 GIC        1176 :     RelationCacheInitFileRemove();
    5198                 : 
    5199                 :     /*
    5200                 :      * Initialize replication slots, before there's a chance to remove
    5201                 :      * required resources.
    5202 ECB             :      */
    5203 CBC        1176 :     StartupReplicationSlots();
    5204                 : 
    5205 ECB             :     /*
    5206                 :      * Startup logical state, needs to be setup now so we have proper data
    5207                 :      * during crash recovery.
    5208                 :      */
    5209 GIC        1176 :     StartupReorderBuffer();
    5210 ECB             : 
    5211 EUB             :     /*
    5212                 :      * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
    5213                 :      * been initialized and before we accept connections or begin WAL replay.
    5214 ECB             :      */
    5215 GIC        1176 :     StartupCLOG();
    5216 ECB             : 
    5217                 :     /*
    5218                 :      * Startup MultiXact. We need to do this early to be able to replay
    5219                 :      * truncations.
    5220                 :      */
    5221 GIC        1176 :     StartupMultiXact();
    5222 ECB             : 
    5223                 :     /*
    5224                 :      * Ditto for commit timestamps.  Activate the facility if the setting is
    5225                 :      * enabled in the control file, as there should be no tracking of commit
    5226                 :      * timestamps done when the setting was disabled.  This facility can be
    5227                 :      * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
    5228                 :      */
    5229 GIC        1176 :     if (ControlFile->track_commit_timestamp)
    5230               8 :         StartupCommitTs();
    5231 ECB             : 
    5232                 :     /*
    5233 EUB             :      * Recover knowledge about replay progress of known replication partners.
    5234                 :      */
    5235 GIC        1176 :     StartupReplicationOrigin();
    5236                 : 
    5237 EUB             :     /*
    5238                 :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
    5239                 :      * control file. On recovery, all unlogged relations are blown away, so
    5240                 :      * the unlogged LSN counter can be reset too.
    5241                 :      */
    5242 GIC        1176 :     if (ControlFile->state == DB_SHUTDOWNED)
    5243            1025 :         XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
    5244                 :     else
    5245 GBC         151 :         XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
    5246                 : 
    5247 ECB             :     /*
    5248                 :      * Copy any missing timeline history files between 'now' and the recovery
    5249                 :      * target timeline from archive to pg_wal. While we don't need those files
    5250                 :      * ourselves - the history file of the recovery target timeline covers all
    5251                 :      * the previous timelines in the history too - a cascading standby server
    5252                 :      * might be interested in them. Or, if you archive the WAL from this
    5253                 :      * server to a different archive than the primary, it'd be good for all
    5254                 :      * the history files to get archived there after failover, so that you can
    5255                 :      * use one of the old timelines as a PITR target. Timeline history files
    5256                 :      * are small, so it's better to copy them unnecessarily than not copy them
    5257                 :      * and regret later.
    5258                 :      */
    5259 CBC        1176 :     restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
    5260                 : 
    5261 EUB             :     /*
    5262                 :      * Before running in recovery, scan pg_twophase and fill in its status to
    5263                 :      * be able to work on entries generated by redo.  Doing a scan before
    5264                 :      * taking any recovery action has the merit to discard any 2PC files that
    5265                 :      * are newer than the first record to replay, avoiding any conflicts at
    5266                 :      * replay.  This also avoids any subsequent scans when doing recovery
    5267                 :      * of the on-disk two-phase data.
    5268                 :      */
    5269 GIC        1176 :     restoreTwoPhaseData();
    5270                 : 
    5271                 :     /*
    5272                 :      * When starting with crash recovery, reset pgstat data - it might not be
    5273                 :      * valid. Otherwise restore pgstat data. It's safe to do this here,
    5274                 :      * because postmaster will not yet have started any other processes.
    5275                 :      *
    5276                 :      * NB: Restoring replication slot stats relies on slot state to have
    5277 ECB             :      * already been restored from disk.
    5278                 :      *
    5279                 :      * TODO: With a bit of extra work we could just start with a pgstat file
    5280                 :      * associated with the checkpoint redo location we're starting from.
    5281                 :      */
    5282 GIC        1176 :     if (didCrash)
    5283             131 :         pgstat_discard_stats();
    5284                 :     else
    5285            1045 :         pgstat_restore_stats();
    5286                 : 
    5287            1176 :     lastFullPageWrites = checkPoint.fullPageWrites;
    5288                 : 
    5289            1176 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    5290            1176 :     doPageWrites = lastFullPageWrites;
    5291                 : 
    5292                 :     /* REDO */
    5293            1176 :     if (InRecovery)
    5294                 :     {
    5295                 :         /* Initialize state for RecoveryInProgress() */
    5296             151 :         SpinLockAcquire(&XLogCtl->info_lck);
    5297             151 :         if (InArchiveRecovery)
    5298 CBC          73 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    5299 ECB             :         else
    5300 GIC          78 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
    5301 CBC         151 :         SpinLockRelease(&XLogCtl->info_lck);
    5302 ECB             : 
    5303                 :         /*
    5304                 :          * Update pg_control to show that we are recovering and to show the
    5305                 :          * selected checkpoint as the place we are starting from. We also mark
    5306                 :          * pg_control with any minimum recovery stop point obtained from a
    5307                 :          * backup history file.
    5308                 :          *
    5309                 :          * No need to hold ControlFileLock yet, we aren't up far enough.
    5310                 :          */
    5311 GIC         151 :         UpdateControlFile();
    5312                 : 
    5313                 :         /*
    5314                 :          * If there was a backup label file, it's done its job and the info
    5315                 :          * has now been propagated into pg_control.  We must get rid of the
    5316 ECB             :          * label file so that if we crash during recovery, we'll pick up at
    5317                 :          * the latest recovery restartpoint instead of going all the way back
    5318                 :          * to the backup start point.  It seems prudent though to just rename
    5319                 :          * the file out of the way rather than delete it completely.
    5320                 :          */
    5321 CBC         151 :         if (haveBackupLabel)
    5322 ECB             :         {
    5323 CBC          51 :             unlink(BACKUP_LABEL_OLD);
    5324              51 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
    5325 ECB             :         }
    5326                 : 
    5327                 :         /*
    5328                 :          * If there was a tablespace_map file, it's done its job and the
    5329                 :          * symlinks have been created.  We must get rid of the map file so
    5330                 :          * that if we crash during recovery, we don't create symlinks again.
    5331                 :          * It seems prudent though to just rename the file out of the way
    5332                 :          * rather than delete it completely.
    5333                 :          */
    5334 GIC         151 :         if (haveTblspcMap)
    5335                 :         {
    5336               1 :             unlink(TABLESPACE_MAP_OLD);
    5337               1 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
    5338                 :         }
    5339                 : 
    5340                 :         /*
    5341                 :          * Initialize our local copy of minRecoveryPoint.  When doing crash
    5342                 :          * recovery we want to replay up to the end of WAL.  Particularly, in
    5343                 :          * the case of a promoted standby minRecoveryPoint value in the
    5344 ECB             :          * control file is only updated after the first checkpoint.  However,
    5345                 :          * if the instance crashes before the first post-recovery checkpoint
    5346                 :          * is completed then recovery will use a stale location causing the
    5347                 :          * startup process to think that there are still invalid page
    5348                 :          * references when checking for data consistency.
    5349                 :          */
    5350 CBC         151 :         if (InArchiveRecovery)
    5351                 :         {
    5352 GIC          73 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    5353              73 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    5354                 :         }
    5355                 :         else
    5356 ECB             :         {
    5357 GIC          78 :             LocalMinRecoveryPoint = InvalidXLogRecPtr;
    5358              78 :             LocalMinRecoveryPointTLI = 0;
    5359                 :         }
    5360                 : 
    5361                 :         /* Check that the GUCs used to generate the WAL allow recovery */
    5362 CBC         151 :         CheckRequiredParameterValues();
    5363                 : 
    5364                 :         /*
    5365                 :          * We're in recovery, so unlogged relations may be trashed and must be
    5366                 :          * reset.  This should be done BEFORE allowing Hot Standby
    5367                 :          * connections, so that read-only backends don't try to read whatever
    5368 ECB             :          * garbage is left over from before.
    5369                 :          */
    5370 GIC         151 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
    5371                 : 
    5372                 :         /*
    5373                 :          * Likewise, delete any saved transaction snapshot files that got left
    5374                 :          * behind by crashed backends.
    5375                 :          */
    5376 CBC         151 :         DeleteAllExportedSnapshotFiles();
    5377 ECB             : 
    5378                 :         /*
    5379                 :          * Initialize for Hot Standby, if enabled. We won't let backends in
    5380                 :          * yet, not until we've reached the min recovery point specified in
    5381                 :          * control file and we've established a recovery snapshot from a
    5382                 :          * running-xacts WAL record.
    5383                 :          */
    5384 GIC         151 :         if (ArchiveRecoveryRequested && EnableHotStandby)
    5385                 :         {
    5386                 :             TransactionId *xids;
    5387                 :             int         nxids;
    5388                 : 
    5389 CBC          71 :             ereport(DEBUG1,
    5390 ECB             :                     (errmsg_internal("initializing for hot standby")));
    5391                 : 
    5392 CBC          71 :             InitRecoveryTransactionEnvironment();
    5393                 : 
    5394 GIC          71 :             if (wasShutdown)
    5395              12 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    5396                 :             else
    5397              59 :                 oldestActiveXID = checkPoint.oldestActiveXid;
    5398              71 :             Assert(TransactionIdIsValid(oldestActiveXID));
    5399                 : 
    5400                 :             /* Tell procarray about the range of xids it has to deal with */
    5401              71 :             ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
    5402                 : 
    5403                 :             /*
    5404                 :              * Startup subtrans only.  CLOG, MultiXact and commit timestamp
    5405                 :              * have already been started up and other SLRUs are not maintained
    5406 ECB             :              * during recovery and need not be started yet.
    5407                 :              */
    5408 GIC          71 :             StartupSUBTRANS(oldestActiveXID);
    5409                 : 
    5410                 :             /*
    5411                 :              * If we're beginning at a shutdown checkpoint, we know that
    5412                 :              * nothing was running on the primary at this point. So fake-up an
    5413                 :              * empty running-xacts record and use that here and now. Recover
    5414                 :              * additional standby state for prepared transactions.
    5415                 :              */
    5416 CBC          71 :             if (wasShutdown)
    5417                 :             {
    5418                 :                 RunningTransactionsData running;
    5419                 :                 TransactionId latestCompletedXid;
    5420                 : 
    5421                 :                 /*
    5422                 :                  * Construct a RunningTransactions snapshot representing a
    5423                 :                  * shut down server, with only prepared transactions still
    5424                 :                  * alive. We're never overflowed at this point because all
    5425                 :                  * subxids are listed with their parent prepared transactions.
    5426                 :                  */
    5427 GIC          12 :                 running.xcnt = nxids;
    5428              12 :                 running.subxcnt = 0;
    5429 CBC          12 :                 running.subxid_overflow = false;
    5430              12 :                 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    5431 GIC          12 :                 running.oldestRunningXid = oldestActiveXID;
    5432 CBC          12 :                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    5433 GIC          12 :                 TransactionIdRetreat(latestCompletedXid);
    5434 CBC          12 :                 Assert(TransactionIdIsNormal(latestCompletedXid));
    5435 GIC          12 :                 running.latestCompletedXid = latestCompletedXid;
    5436 CBC          12 :                 running.xids = xids;
    5437 ECB             : 
    5438 GIC          12 :                 ProcArrayApplyRecoveryInfo(&running);
    5439                 : 
    5440 CBC          12 :                 StandbyRecoverPreparedTransactions();
    5441                 :             }
    5442                 :         }
    5443 ECB             : 
    5444                 :         /*
    5445                 :          * We're all set for replaying the WAL now. Do it.
    5446                 :          */
    5447 CBC         151 :         PerformWalRecovery();
    5448             117 :         performedWalRecovery = true;
    5449                 :     }
    5450                 :     else
    5451 GIC        1025 :         performedWalRecovery = false;
    5452                 : 
    5453                 :     /*
    5454                 :      * Finish WAL recovery.
    5455                 :      */
    5456            1142 :     endOfRecoveryInfo = FinishWalRecovery();
    5457            1142 :     EndOfLog = endOfRecoveryInfo->endOfLog;
    5458 CBC        1142 :     EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
    5459 GIC        1142 :     abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
    5460            1142 :     missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
    5461                 : 
    5462                 :     /*
    5463                 :      * Reset ps status display, so that no information related to recovery
    5464                 :      * shows up.
    5465                 :      */
    5466            1142 :     set_ps_display("");
    5467                 : 
    5468 ECB             :     /*
    5469                 :      * When recovering from a backup (we are in recovery, and archive recovery
    5470                 :      * was requested), complain if we did not roll forward far enough to reach
    5471                 :      * the point where the database is consistent.  For regular online
    5472                 :      * backup-from-primary, that means reaching the end-of-backup WAL record
    5473                 :      * (at which point we reset backupStartPoint to be Invalid), for
    5474                 :      * backup-from-replica (which can't inject records into the WAL stream),
    5475                 :      * that point is when we reach the minRecoveryPoint in pg_control (which
    5476                 :      * we purposefully copy last when backing up from a replica).  For
    5477                 :      * pg_rewind (which creates a backup_label with a method of "pg_rewind")
    5478                 :      * or snapshot-style backups (which don't), backupEndRequired will be set
    5479                 :      * to false.
    5480                 :      *
    5481                 :      * Note: it is indeed okay to look at the local variable
    5482                 :      * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
    5483                 :      * might be further ahead --- ControlFile->minRecoveryPoint cannot have
    5484                 :      * been advanced beyond the WAL we processed.
    5485                 :      */
    5486 GIC        1142 :     if (InRecovery &&
    5487             117 :         (EndOfLog < LocalMinRecoveryPoint ||
    5488             117 :          !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
    5489                 :     {
    5490                 :         /*
    5491                 :          * Ran off end of WAL before reaching end-of-backup WAL record, or
    5492                 :          * minRecoveryPoint. That's a bad sign, indicating that you tried to
    5493                 :          * recover from an online backup but never called pg_backup_stop(), or
    5494                 :          * you didn't archive all the WAL needed.
    5495                 :          */
    5496 UIC           0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
    5497 ECB             :         {
    5498 UIC           0 :             if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
    5499 LBC           0 :                 ereport(FATAL,
    5500 ECB             :                         (errmsg("WAL ends before end of online backup"),
    5501                 :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
    5502                 :             else
    5503 UIC           0 :                 ereport(FATAL,
    5504 ECB             :                         (errmsg("WAL ends before consistent recovery point")));
    5505                 :         }
    5506                 :     }
    5507                 : 
    5508                 :     /*
    5509                 :      * Reset unlogged relations to the contents of their INIT fork. This is
    5510                 :      * done AFTER recovery is complete so as to include any unlogged relations
    5511                 :      * created during recovery, but BEFORE recovery is marked as having
    5512                 :      * completed successfully. Otherwise we'd not retry if any of the post
    5513                 :      * end-of-recovery steps fail.
    5514                 :      */
    5515 GIC        1142 :     if (InRecovery)
    5516             117 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
    5517 ECB             : 
    5518                 :     /*
    5519                 :      * Pre-scan prepared transactions to find out the range of XIDs present.
    5520                 :      * This information is not quite needed yet, but it is positioned here so
    5521                 :      * that potential problems are detected before any on-disk change is done.
    5522                 :      */
    5523 CBC        1142 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
    5524                 : 
    5525                 :     /*
    5526                 :      * Allow ordinary WAL segment creation before possibly switching to a new
    5527                 :      * timeline, which creates a new segment, and after the last ReadRecord().
    5528                 :      */
    5529 GNC        1142 :     SetInstallXLogFileSegmentActive();
    5530                 : 
    5531                 :     /*
    5532                 :      * Consider whether we need to assign a new timeline ID.
    5533                 :      *
    5534 ECB             :      * If we did archive recovery, we always assign a new ID.  This handles a
    5535                 :      * couple of issues.  If we stopped short of the end of WAL during
    5536                 :      * recovery, then we are clearly generating a new timeline and must assign
    5537                 :      * it a unique new ID.  Even if we ran to the end, modifying the current
    5538                 :      * last segment is problematic because it may result in trying to
    5539                 :      * overwrite an already-archived copy of that segment, and we encourage
    5540                 :      * DBAs to make their archive_commands reject that.  We can dodge the
    5541                 :      * problem by making the new active segment have a new timeline ID.
    5542                 :      *
    5543                 :      * In a normal crash recovery, we can just extend the timeline we were in.
    5544                 :      */
    5545 GIC        1142 :     newTLI = endOfRecoveryInfo->lastRecTLI;
    5546 CBC        1142 :     if (ArchiveRecoveryRequested)
    5547                 :     {
    5548 GIC          39 :         newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
    5549              39 :         ereport(LOG,
    5550                 :                 (errmsg("selected new timeline ID: %u", newTLI)));
    5551                 : 
    5552                 :         /*
    5553 ECB             :          * Make a writable copy of the last WAL segment.  (Note that we also
    5554                 :          * have a copy of the last block of the old WAL in
    5555                 :          * endOfRecovery->lastPage; we will use that below.)
    5556                 :          */
    5557 GIC          39 :         XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
    5558                 : 
    5559                 :         /*
    5560                 :          * Remove the signal files out of the way, so that we don't
    5561 ECB             :          * accidentally re-enter archive recovery mode in a subsequent crash.
    5562                 :          */
    5563 GIC          39 :         if (endOfRecoveryInfo->standby_signal_file_found)
    5564              36 :             durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
    5565                 : 
    5566              39 :         if (endOfRecoveryInfo->recovery_signal_file_found)
    5567               3 :             durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
    5568                 : 
    5569                 :         /*
    5570                 :          * Write the timeline history file, and have it archived. After this
    5571                 :          * point (or rather, as soon as the file is archived), the timeline
    5572 ECB             :          * will appear as "taken" in the WAL archive and to any standby
    5573                 :          * servers.  If we crash before actually switching to the new
    5574                 :          * timeline, standby servers will nevertheless think that we switched
    5575                 :          * to the new timeline, and will try to connect to the new timeline.
    5576                 :          * To minimize the window for that, try to do as little as possible
    5577                 :          * between here and writing the end-of-recovery record.
    5578                 :          */
    5579 CBC          39 :         writeTimeLineHistory(newTLI, recoveryTargetTLI,
    5580 ECB             :                              EndOfLog, endOfRecoveryInfo->recoveryStopReason);
    5581                 : 
    5582 GIC          39 :         ereport(LOG,
    5583 ECB             :                 (errmsg("archive recovery complete")));
    5584                 :     }
    5585                 : 
    5586                 :     /* Save the selected TimeLineID in shared memory, too */
    5587 GIC        1142 :     XLogCtl->InsertTimeLineID = newTLI;
    5588            1142 :     XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
    5589                 : 
    5590                 :     /*
    5591                 :      * Actually, if WAL ended in an incomplete record, skip the parts that
    5592 ECB             :      * made it through and start writing after the portion that persisted.
    5593                 :      * (It's critical to first write an OVERWRITE_CONTRECORD message, which
    5594                 :      * we'll do as soon as we're open for writing new WAL.)
    5595                 :      */
    5596 CBC        1142 :     if (!XLogRecPtrIsInvalid(missingContrecPtr))
    5597                 :     {
    5598                 :         /*
    5599                 :          * We should only have a missingContrecPtr if we're not switching to
    5600                 :          * a new timeline. When a timeline switch occurs, WAL is copied from
    5601 ECB             :          * the old timeline to the new only up to the end of the last complete
    5602                 :          * record, so there can't be an incomplete WAL record that we need to
    5603                 :          * disregard.
    5604                 :          */
    5605 CBC           1 :         Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
    5606 GIC           1 :         Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
    5607               1 :         EndOfLog = missingContrecPtr;
    5608                 :     }
    5609                 : 
    5610                 :     /*
    5611 ECB             :      * Prepare to write WAL starting at EndOfLog location, and init xlog
    5612                 :      * buffer cache using the block containing the last record from the
    5613                 :      * previous incarnation.
    5614                 :      */
    5615 GIC        1142 :     Insert = &XLogCtl->Insert;
    5616            1142 :     Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
    5617            1142 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
    5618                 : 
    5619                 :     /*
    5620                 :      * Tricky point here: lastPage contains the *last* block that the LastRec
    5621                 :      * record spans, not the one it starts in.  The last block is indeed the
    5622                 :      * one we want to use.
    5623                 :      */
    5624            1142 :     if (EndOfLog % XLOG_BLCKSZ != 0)
    5625                 :     {
    5626                 :         char       *page;
    5627                 :         int         len;
    5628                 :         int         firstIdx;
    5629                 : 
    5630            1128 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
    5631 CBC        1128 :         len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
    5632            1128 :         Assert(len < XLOG_BLCKSZ);
    5633 ECB             : 
    5634                 :         /* Copy the valid part of the last block, and zero the rest */
    5635 GIC        1128 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
    5636            1128 :         memcpy(page, endOfRecoveryInfo->lastPage, len);
    5637            1128 :         memset(page + len, 0, XLOG_BLCKSZ - len);
    5638                 : 
    5639            1128 :         XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
    5640            1128 :         XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
    5641 EUB             :     }
    5642                 :     else
    5643                 :     {
    5644                 :         /*
    5645                 :          * There is no partial block to copy. Just set InitializedUpTo, and
    5646                 :          * let the first attempt to insert a log record to initialize the next
    5647                 :          * buffer.
    5648                 :          */
    5649 GIC          14 :         XLogCtl->InitializedUpTo = EndOfLog;
    5650                 :     }
    5651                 : 
    5652            1142 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
    5653                 : 
    5654            1142 :     XLogCtl->LogwrtResult = LogwrtResult;
    5655                 : 
    5656            1142 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
    5657            1142 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
    5658                 : 
    5659                 :     /*
    5660 ECB             :      * Preallocate additional log files, if wanted.
    5661                 :      */
    5662 GIC        1142 :     PreallocXlogFiles(EndOfLog, newTLI);
    5663                 : 
    5664                 :     /*
    5665                 :      * Okay, we're officially UP.
    5666                 :      */
    5667            1142 :     InRecovery = false;
    5668 ECB             : 
    5669                 :     /* start the archive_timeout timer and LSN running */
    5670 GIC        1142 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
    5671            1142 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
    5672                 : 
    5673                 :     /* also initialize latestCompletedXid, to nextXid - 1 */
    5674 CBC        1142 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    5675 GIC        1142 :     ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
    5676            1142 :     FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
    5677            1142 :     LWLockRelease(ProcArrayLock);
    5678                 : 
    5679                 :     /*
    5680                 :      * Start up subtrans, if not already done for hot standby.  (commit
    5681                 :      * timestamps are started below, if necessary.)
    5682                 :      */
    5683            1142 :     if (standbyState == STANDBY_DISABLED)
    5684            1103 :         StartupSUBTRANS(oldestActiveXID);
    5685                 : 
    5686                 :     /*
    5687                 :      * Perform end of recovery actions for any SLRUs that need it.
    5688                 :      */
    5689            1142 :     TrimCLOG();
    5690 CBC        1142 :     TrimMultiXact();
    5691 ECB             : 
    5692                 :     /* Reload shared-memory state for prepared transactions */
    5693 CBC        1142 :     RecoverPreparedTransactions();
    5694 ECB             : 
    5695                 :     /* Shut down xlogreader */
    5696 GIC        1142 :     ShutdownWalRecovery();
    5697                 : 
    5698                 :     /* Enable WAL writes for this backend only. */
    5699            1142 :     LocalSetXLogInsertAllowed();
    5700                 : 
    5701                 :     /* If necessary, write overwrite-contrecord before doing anything else */
    5702 CBC        1142 :     if (!XLogRecPtrIsInvalid(abortedRecPtr))
    5703                 :     {
    5704 GIC           1 :         Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
    5705               1 :         CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
    5706                 :     }
    5707                 : 
    5708 ECB             :     /*
    5709                 :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
    5710                 :      * record before resource manager writes cleanup WAL records or checkpoint
    5711                 :      * record is written.
    5712                 :      */
    5713 GIC        1142 :     Insert->fullPageWrites = lastFullPageWrites;
    5714            1142 :     UpdateFullPageWrites();
    5715                 : 
    5716                 :     /*
    5717                 :      * Emit checkpoint or end-of-recovery record in XLOG, if required.
    5718                 :      */
    5719            1142 :     if (performedWalRecovery)
    5720             117 :         promoted = PerformRecoveryXLogAction();
    5721                 : 
    5722                 :     /*
    5723                 :      * If any of the critical GUCs have changed, log them before we allow
    5724 ECB             :      * backends to write WAL.
    5725                 :      */
    5726 GIC        1142 :     XLogReportParameters();
    5727 ECB             : 
    5728                 :     /* If this is archive recovery, perform post-recovery cleanup actions. */
    5729 GIC        1142 :     if (ArchiveRecoveryRequested)
    5730              39 :         CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
    5731                 : 
    5732 ECB             :     /*
    5733                 :      * Local WAL inserts enabled, so it's time to finish initialization of
    5734                 :      * commit timestamp.
    5735                 :      */
    5736 GIC        1142 :     CompleteCommitTsInitialization();
    5737                 : 
    5738                 :     /*
    5739                 :      * All done with end-of-recovery actions.
    5740                 :      *
    5741 ECB             :      * Now allow backends to write WAL and update the control file status in
    5742                 :      * consequence.  SharedRecoveryState, that controls if backends can write
    5743                 :      * WAL, is updated while holding ControlFileLock to prevent other backends
    5744                 :      * to look at an inconsistent state of the control file in shared memory.
    5745                 :      * There is still a small window during which backends can write WAL and
    5746                 :      * the control file is still referring to a system not in DB_IN_PRODUCTION
    5747                 :      * state while looking at the on-disk control file.
    5748                 :      *
    5749                 :      * Also, we use info_lck to update SharedRecoveryState to ensure that
    5750                 :      * there are no race conditions concerning visibility of other recent
    5751                 :      * updates to shared memory.
    5752                 :      */
    5753 GIC        1142 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    5754            1142 :     ControlFile->state = DB_IN_PRODUCTION;
    5755                 : 
    5756            1142 :     SpinLockAcquire(&XLogCtl->info_lck);
    5757            1142 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
    5758            1142 :     SpinLockRelease(&XLogCtl->info_lck);
    5759                 : 
    5760 CBC        1142 :     UpdateControlFile();
    5761            1142 :     LWLockRelease(ControlFileLock);
    5762 ECB             : 
    5763                 :     /*
    5764                 :      * Shutdown the recovery environment.  This must occur after
    5765                 :      * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
    5766                 :      * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
    5767                 :      * any session building a snapshot will not rely on KnownAssignedXids as
    5768                 :      * RecoveryInProgress() would return false at this stage.  This is
    5769                 :      * particularly critical for prepared 2PC transactions, that would still
    5770                 :      * need to be included in snapshots once recovery has ended.
    5771                 :      */
    5772 GIC        1142 :     if (standbyState != STANDBY_DISABLED)
    5773              39 :         ShutdownRecoveryTransactionEnvironment();
    5774                 : 
    5775 ECB             :     /*
    5776                 :      * If there were cascading standby servers connected to us, nudge any wal
    5777                 :      * sender processes to notice that we've been promoted.
    5778                 :      */
    5779 GNC        1142 :     WalSndWakeup(true, true);
    5780 ECB             : 
    5781                 :     /*
    5782                 :      * If this was a promotion, request an (online) checkpoint now. This isn't
    5783                 :      * required for consistency, but the last restartpoint might be far back,
    5784                 :      * and in case of a crash, recovering from it might take a longer than is
    5785                 :      * appropriate now that we're not in standby mode anymore.
    5786                 :      */
    5787 GIC        1142 :     if (promoted)
    5788              36 :         RequestCheckpoint(CHECKPOINT_FORCE);
    5789            1142 : }
    5790                 : 
    5791                 : /*
    5792                 :  * Callback from PerformWalRecovery(), called when we switch from crash
    5793                 :  * recovery to archive recovery mode.  Updates the control file accordingly: the state becomes DB_IN_ARCHIVE_RECOVERY and minRecoveryPoint is raised to EndRecPtr (on replayTLI) if it was behind.
    5794 ECB             :  */
    5795                 : void
    5796 GIC           2 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
    5797 ECB             : {
    5798                 :     /* initialize minRecoveryPoint to this record, but never move it backwards */
    5799 CBC           2 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    5800 GIC           2 :     ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
    5801 CBC           2 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    5802 ECB             :     {
    5803 GIC           2 :         ControlFile->minRecoveryPoint = EndRecPtr;
    5804               2 :         ControlFile->minRecoveryPointTLI = replayTLI;
    5805                 :     }
    5806                 :     /* update the local copies of minRecoveryPoint and its timeline */
    5807 CBC           2 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    5808 GIC           2 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    5809                 : 
    5810                 :     /*
    5811                 :      * The startup process can update its local copy of minRecoveryPoint from
    5812 ECB             :      * this point.
    5813                 :      */
    5814 GIC           2 :     updateMinRecoveryPoint = true;
    5815 ECB             : 
    5816 CBC           2 :     UpdateControlFile();
    5817                 : 
    5818                 :     /*
    5819 ECB             :      * We update SharedRecoveryState while still holding ControlFileLock
    5820                 :      * so both states are consistent in shared memory.
    5821                 :      */
    5822 CBC           2 :     SpinLockAcquire(&XLogCtl->info_lck);
    5823 GIC           2 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
    5824               2 :     SpinLockRelease(&XLogCtl->info_lck);
    5825                 : 
    5826               2 :     LWLockRelease(ControlFileLock);
    5827               2 : }
    5828 ECB             : 
    5829                 : /*
    5830                 :  * Callback from PerformWalRecovery(), called when we reach the end of backup.
    5831                 :  * Updates the control file accordingly, while holding ControlFileLock.
    5832                 :  */
    5833                 : void
    5834 CBC          51 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
    5835 ECB             : {
    5836                 :     /*
    5837                 :      * We have reached the end of base backup, as indicated by pg_control. The
    5838                 :      * data on disk is now consistent (unless minRecoveryPoint is further
    5839                 :      * ahead, which can happen if we crashed during a previous recovery).  Reset
    5840                 :      * backupStartPoint, backupEndPoint and backupEndRequired, and update minRecoveryPoint to
    5841                 :      * make sure we don't allow starting up at an earlier point even if
    5842                 :      * recovery is stopped and restarted soon after this.
    5843                 :      */
    5844 CBC          51 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    5845                 : 
    5846 GIC          51 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
    5847 ECB             :     {
    5848 GIC          49 :         ControlFile->minRecoveryPoint = EndRecPtr;
    5849 CBC          49 :         ControlFile->minRecoveryPointTLI = tli;
    5850 ECB             :     }
    5851                 : 
    5852 GIC          51 :     ControlFile->backupStartPoint = InvalidXLogRecPtr;
    5853              51 :     ControlFile->backupEndPoint = InvalidXLogRecPtr;
    5854              51 :     ControlFile->backupEndRequired = false;
    5855              51 :     UpdateControlFile();
    5856                 : 
    5857              51 :     LWLockRelease(ControlFileLock);
    5858 CBC          51 : }
    5859 ECB             : 
    5860                 : /*
    5861                 :  * Perform whatever XLOG actions are necessary at end of REDO.
    5862                 :  *
    5863                 :  * The goal here is to make sure that we'll be able to recover properly if
    5864                 :  * we crash again. If we choose to write a checkpoint, we'll write a shutdown
    5865                 :  * checkpoint rather than an on-line one. This is not particularly critical,
    5866                 :  * but since we may be assigning a new TLI, using a shutdown checkpoint allows
    5867                 :  * us to have the rule that TLI only changes in shutdown checkpoints, which
    5868                 :  * allows some extra error checking in xlog_redo.  Returns true if an end-of-recovery record was written for a promotion instead of a checkpoint, false otherwise.
    5869                 :  */
    5870                 : static bool
    5871 CBC         117 : PerformRecoveryXLogAction(void)
    5872                 : {
    5873 GIC         117 :     bool        promoted = false;
    5874 ECB             : 
    5875                 :     /*
    5876                 :      * Perform a checkpoint to flush all our recovery activity to disk.
    5877                 :      *
    5878                 :      * Note that we write a shutdown checkpoint rather than an on-line one.
    5879                 :      * This is not particularly critical, but since we may be assigning a new
    5880                 :      * TLI, using a shutdown checkpoint allows us to have the rule that TLI
    5881                 :      * only changes in shutdown checkpoints, which allows some extra error
    5882                 :      * checking in xlog_redo.
    5883                 :      *
    5884                 :      * On promotion, only create a lightweight end-of-recovery record instead
    5885                 :      * of a full checkpoint. A checkpoint is requested later, after we're
    5886                 :      * fully out of recovery mode and already accepting queries.
    5887                 :      */
    5888 GIC         156 :     if (ArchiveRecoveryRequested && IsUnderPostmaster &&
    5889              39 :         PromoteIsTriggered())
    5890                 :     {
    5891              36 :         promoted = true;
    5892                 : 
    5893                 :         /*
    5894                 :          * Insert a special WAL record to mark the end of recovery, since we
    5895                 :          * aren't doing a checkpoint. That means that the checkpointer process
    5896                 :          * may well be in the middle of a time-smoothed restartpoint and
    5897                 :          * could continue to be for minutes after this.  That sounds strange,
    5898 ECB             :          * but the effect is roughly the same and it would be stranger to try
    5899                 :          * to come out of the restartpoint and then checkpoint. We request a
    5900                 :          * checkpoint later anyway, just for safety.
    5901                 :          */
    5902 CBC          36 :         CreateEndOfRecoveryRecord();
    5903 ECB             :     }
    5904                 :     else
    5905                 :     {
    5906 CBC          81 :         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
    5907                 :                           CHECKPOINT_IMMEDIATE |
    5908                 :                           CHECKPOINT_WAIT);
    5909                 :     }
    5910                 : 
    5911 GIC         117 :     return promoted;
    5912                 : }
    5913                 : 
    5914                 : /*
    5915                 :  * Is the system still in recovery?
    5916                 :  *
    5917 ECB             :  * Unlike testing InRecovery, this works in any process that's connected to
    5918                 :  * shared memory.
    5919                 :  */
    5920                 : bool
    5921 GIC    81142281 : RecoveryInProgress(void)
    5922                 : {
    5923                 :     /*
    5924 ECB             :      * We check shared state each time only until we leave recovery mode. We
    5925                 :      * can't re-enter recovery, so there's no need to keep checking once the
    5926                 :      * shared state has been seen to be RECOVERY_STATE_DONE.
    5927                 :      */
    5928 GIC    81142281 :     if (!LocalRecoveryInProgress)
    5929        77601222 :         return false;
    5930                 :     else
    5931                 :     {
    5932 ECB             :         /*
    5933                 :          * Use a volatile pointer to make sure we make a fresh read of the
    5934                 :          * shared variable.
    5935                 :          */
    5936 GIC     3541059 :         volatile XLogCtlData *xlogctl = XLogCtl;
    5937                 : 
    5938         3541059 :         LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
    5939                 : 
    5940                 :         /*
    5941 ECB             :          * Note: We don't need a memory barrier when we're still in recovery.
    5942                 :          * We might exit recovery immediately after return, so the caller
    5943                 :          * can't rely on 'true' meaning that we're still in recovery anyway.
    5944                 :          */
    5945                 : 
    5946 CBC     3541059 :         return LocalRecoveryInProgress;
    5947                 :     }
    5948 ECB             : }
    5949                 : 
    5950                 : /*
    5951                 :  * Returns the current recovery state from shared memory, read under info_lck.
    5952                 :  *
    5953                 :  * The returned state is kept consistent with the contents of the control
    5954                 :  * file.  See details about the possible values of RecoveryState in xlog.h.
    5955                 :  */
    5956                 : RecoveryState
    5957 GIC          25 : GetRecoveryState(void)
    5958                 : {
    5959 ECB             :     RecoveryState retval;
    5960                 : 
    5961 CBC          25 :     SpinLockAcquire(&XLogCtl->info_lck);
    5962 GIC          25 :     retval = XLogCtl->SharedRecoveryState;
    5963              25 :     SpinLockRelease(&XLogCtl->info_lck);
    5964                 : 
    5965              25 :     return retval;
    5966                 : }
    5967 ECB             : 
    5968                 : /*
    5969                 :  * Is this process allowed to insert new WAL records?
    5970                 :  *
    5971                 :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
    5972                 :  * But we also have provisions for forcing the result "true" or "false"
    5973                 :  * within specific processes regardless of the global state.
    5974                 :  */
    5975                 : bool
    5976 GIC    43544110 : XLogInsertAllowed(void)
    5977                 : {
    5978                 :     /*
    5979 ECB             :      * If LocalXLogInsertAllowed is non-negative, it is "unconditionally true" or "unconditionally false"; just
    5980                 :      * return it.  This provides the normal fast path once recovery is known
    5981                 :      * to be done.
    5982                 :      */
    5983 GIC    43544110 :     if (LocalXLogInsertAllowed >= 0)
    5984        43451207 :         return (bool) LocalXLogInsertAllowed;
    5985                 : 
    5986                 :     /*
    5987                 :      * Otherwise (negative value), we must check to see if we're still in recovery.
    5988                 :      */
    5989 CBC       92903 :     if (RecoveryInProgress())
    5990 GIC       87067 :         return false;
    5991 ECB             : 
    5992                 :     /*
    5993                 :      * On exit from recovery, reset to "unconditionally true", since there is
    5994                 :      * no need to keep checking.
    5995                 :      */
    5996 GIC        5836 :     LocalXLogInsertAllowed = 1;
    5997 CBC        5836 :     return true;
    5998 ECB             : }
    5999                 : 
    6000                 : /*
    6001                 :  * Make XLogInsertAllowed() return true in the current process only, by setting LocalXLogInsertAllowed to 1.
    6002                 :  *
    6003                 :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
    6004                 :  * and even call LocalSetXLogInsertAllowed() again after that.
    6005                 :  *
    6006                 :  * Returns the previous value of LocalXLogInsertAllowed.
    6007                 :  */
    6008                 : static int
    6009 GIC        1169 : LocalSetXLogInsertAllowed(void)
    6010                 : {
    6011            1169 :     int         oldXLogAllowed = LocalXLogInsertAllowed;
    6012                 : 
    6013            1169 :     LocalXLogInsertAllowed = 1;
    6014                 : 
    6015            1169 :     return oldXLogAllowed;
    6016 ECB             : }
    6017                 : 
    6018                 : /*
    6019                 :  * Return the current Redo pointer from shared memory.
    6020                 :  *
    6021                 :  * As a side-effect, the local RedoRecPtr copy is advanced if the shared value is newer; the local copy is never moved backwards, and it is the local copy that is returned.
    6022                 :  */
    6023                 : XLogRecPtr
    6024 GIC      152956 : GetRedoRecPtr(void)
    6025                 : {
    6026                 :     XLogRecPtr  ptr;
    6027                 : 
    6028                 :     /*
    6029                 :      * The possibly not up-to-date copy in XLogCtl is enough. Even if we
    6030                 :      * grabbed a WAL insertion lock to read the authoritative value in
    6031                 :      * Insert->RedoRecPtr, someone might update it just after we've released
    6032                 :      * the lock.
    6033 ECB             :      */
    6034 CBC      152956 :     SpinLockAcquire(&XLogCtl->info_lck);
    6035 GIC      152956 :     ptr = XLogCtl->RedoRecPtr;
    6036 CBC      152956 :     SpinLockRelease(&XLogCtl->info_lck);
    6037                 : 
    6038 GIC      152956 :     if (RedoRecPtr < ptr)
    6039             742 :         RedoRecPtr = ptr;
    6040                 : 
    6041          152956 :     return RedoRecPtr;
    6042                 : }
    6043                 : 
    6044                 : /*
    6045                 :  * Return information needed to decide whether a modified block needs a
    6046                 :  * full-page image to be included in the WAL record.
    6047 ECB             :  *
    6048                 :  * The returned values are cached copies from backend-private memory, and
    6049                 :  * possibly out-of-date or, indeed, uninitialized, in which case they will
    6050                 :  * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
    6051                 :  * re-check them against up-to-date values, while holding the WAL insert lock.  Both output pointers are dereferenced unconditionally, so neither may be NULL.
    6052                 :  */
    6053                 : void
    6054 GIC    19592404 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
    6055                 : {
    6056 CBC    19592404 :     *RedoRecPtr_p = RedoRecPtr;
    6057 GIC    19592404 :     *doPageWrites_p = doPageWrites;
    6058        19592404 : }
    6059                 : 
    6060                 : /*
    6061                 :  * GetInsertRecPtr -- Returns the (approximate) current insert position.
    6062                 :  *
    6063                 :  * NOTE: The value *actually* returned (XLogCtl->LogwrtRqst.Write, read under info_lck) is the position of the last full
    6064                 :  * xlog page. It lags behind the real insert position by at most 1 page.
    6065                 :  * This way we don't need to scan through WAL insertion locks, and an
    6066 ECB             :  * approximation is enough for the current usage of this function.
    6067                 :  */
    6068                 : XLogRecPtr
    6069 GIC        2424 : GetInsertRecPtr(void)
    6070                 : {
    6071                 :     XLogRecPtr  recptr;
    6072                 : 
    6073 CBC        2424 :     SpinLockAcquire(&XLogCtl->info_lck);
    6074            2424 :     recptr = XLogCtl->LogwrtRqst.Write;
    6075 GIC        2424 :     SpinLockRelease(&XLogCtl->info_lck);
    6076                 : 
    6077            2424 :     return recptr;
    6078                 : }
    6079                 : 
    6080                 : /*
    6081 ECB             :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
    6082                 :  * position known to be fsync'd to disk. This should only be used on a
    6083                 :  * system that is known not to be in recovery.
    6084                 :  */
    6085                 : XLogRecPtr
    6086 GIC      147978 : GetFlushRecPtr(TimeLineID *insertTLI)
    6087                 : {
    6088          147978 :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    6089                 : 
    6090          147978 :     SpinLockAcquire(&XLogCtl->info_lck);
    6091 CBC      147978 :     LogwrtResult = XLogCtl->LogwrtResult;
    6092 GIC      147978 :     SpinLockRelease(&XLogCtl->info_lck);
    6093                 : 
    6094                 :     /*
    6095                 :      * If we're writing and flushing WAL, the time line can't be changing, so
    6096                 :      * no lock is required.
    6097                 :      */
    6098          147978 :     if (insertTLI)
    6099           24724 :         *insertTLI = XLogCtl->InsertTimeLineID;
    6100                 : 
    6101          147978 :     return LogwrtResult.Flush;
    6102 ECB             : }
    6103                 : 
    6104                 : /*
    6105                 :  * GetWALInsertionTimeLine -- Returns the current timeline of a system that
    6106                 :  * is not in recovery.
    6107                 :  */
    6108                 : TimeLineID
    6109 GIC       10349 : GetWALInsertionTimeLine(void)
    6110 ECB             : {
    6111 GIC       10349 :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
    6112                 : 
    6113                 :     /* Since the value can't be changing, no lock is required. */
    6114           10349 :     return XLogCtl->InsertTimeLineID;
    6115                 : }
    6116                 : 
    6117                 : /*
    6118                 :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
    6119                 :  * inserted. All records not explicitly marked as unimportant are considered
    6120                 :  * important.
    6121 ECB             :  *
    6122                 :  * The LSN is determined by computing the maximum of
    6123                 :  * WALInsertLocks[i].lastImportantAt.
    6124                 :  */
    6125                 : XLogRecPtr
    6126 GIC        2386 : GetLastImportantRecPtr(void)
    6127                 : {
    6128 CBC        2386 :     XLogRecPtr  res = InvalidXLogRecPtr;
    6129 ECB             :     int         i;
    6130                 : 
    6131 GIC       21474 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
    6132                 :     {
    6133                 :         XLogRecPtr  last_important;
    6134 ECB             : 
    6135                 :         /*
    6136                 :          * Need to take a lock to prevent torn reads of the LSN, which are
    6137                 :          * possible on some of the supported platforms. WAL insert locks only
    6138                 :          * support exclusive mode, so we have to use that.
    6139                 :          */
    6140 GIC       19088 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
    6141 CBC       19088 :         last_important = WALInsertLocks[i].l.lastImportantAt;
    6142           19088 :         LWLockRelease(&WALInsertLocks[i].l.lock);
    6143                 : 
    6144 GIC       19088 :         if (res < last_important)
    6145            2619 :             res = last_important;
    6146                 :     }
    6147                 : 
    6148            2386 :     return res;
    6149                 : }
    6150                 : 
    6151                 : /*
    6152                 :  * Get the time and LSN of the last xlog segment switch
    6153                 :  */
    6154 ECB             : pg_time_t
    6155 UIC           0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
    6156 ECB             : {
    6157                 :     pg_time_t   result;
    6158                 : 
    6159                 :     /* Need WALWriteLock, but shared lock is sufficient */
    6160 LBC           0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
    6161 UIC           0 :     result = XLogCtl->lastSegSwitchTime;
    6162               0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
    6163               0 :     LWLockRelease(WALWriteLock);
    6164                 : 
    6165               0 :     return result;
    6166                 : }
    6167                 : 
    6168                 : /*
    6169 ECB             :  * This must be called ONCE during postmaster or standalone-backend shutdown
    6170                 :  */
    6171                 : void
    6172 GIC         971 : ShutdownXLOG(int code, Datum arg)
    6173                 : {
    6174                 :     /*
    6175                 :      * We should have an aux process resource owner to use, and we should not
    6176                 :      * be in a transaction that's installed some other resowner.
    6177                 :      */
    6178             971 :     Assert(AuxProcessResourceOwner != NULL);
    6179 CBC         971 :     Assert(CurrentResourceOwner == NULL ||
    6180 ECB             :            CurrentResourceOwner == AuxProcessResourceOwner);
    6181 CBC         971 :     CurrentResourceOwner = AuxProcessResourceOwner;
    6182                 : 
    6183 ECB             :     /* Don't be chatty in standalone mode */
    6184 CBC         971 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
    6185                 :             (errmsg("shutting down")));
    6186 ECB             : 
    6187                 :     /*
    6188                 :      * Signal walsenders to move to stopping state.
    6189                 :      */
    6190 GIC         971 :     WalSndInitStopping();
    6191                 : 
    6192                 :     /*
    6193                 :      * Wait for WAL senders to be in stopping state.  This prevents commands
    6194                 :      * from writing new WAL.
    6195                 :      */
    6196             971 :     WalSndWaitStopping();
    6197                 : 
    6198             971 :     if (RecoveryInProgress())
    6199 CBC          31 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    6200                 :     else
    6201 ECB             :     {
    6202                 :         /*
    6203                 :          * If archiving is enabled, rotate the last XLOG file so that all the
    6204                 :          * remaining records are archived (postmaster wakes up the archiver
    6205                 :          * process one more time at the end of shutdown). The checkpoint
    6206                 :          * record will go to the next XLOG file and won't be archived (yet).
    6207                 :          */
    6208 GIC         940 :         if (XLogArchivingActive())
    6209               9 :             RequestXLogSwitch(false);
    6210                 : 
    6211             940 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    6212                 :     }
    6213             971 : }
    6214 ECB             : 
    6215                 : /*
    6216                 :  * Log start of a checkpoint.
    6217                 :  */
    6218                 : static void
    6219 CBC         541 : LogCheckpointStart(int flags, bool restartpoint)
    6220 ECB             : {
    6221 GIC         541 :     if (restartpoint)
    6222 CBC          28 :         ereport(LOG,
    6223                 :         /* translator: the placeholders show checkpoint options */
    6224                 :                 (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
    6225                 :                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    6226                 :                         (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    6227                 :                         (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    6228                 :                         (flags & CHECKPOINT_FORCE) ? " force" : "",
    6229                 :                         (flags & CHECKPOINT_WAIT) ? " wait" : "",
    6230                 :                         (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    6231 ECB             :                         (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    6232                 :                         (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
    6233                 :     else
    6234 GIC         513 :         ereport(LOG,
    6235 ECB             :         /* translator: the placeholders show checkpoint options */
    6236                 :                 (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
    6237                 :                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
    6238                 :                         (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
    6239                 :                         (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
    6240                 :                         (flags & CHECKPOINT_FORCE) ? " force" : "",
    6241                 :                         (flags & CHECKPOINT_WAIT) ? " wait" : "",
    6242                 :                         (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
    6243                 :                         (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
    6244                 :                         (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
    6245 GIC         541 : }
    6246 ECB             : 
    6247                 : /*
    6248                 :  * Log end of a checkpoint.
    6249                 :  */
    6250                 : static void
    6251 GIC        2363 : LogCheckpointEnd(bool restartpoint)
    6252                 : {
    6253                 :     long        write_msecs,
    6254 ECB             :                 sync_msecs,
    6255                 :                 total_msecs,
    6256                 :                 longest_msecs,
    6257                 :                 average_msecs;
    6258                 :     uint64      average_sync_time;
    6259                 : 
    6260 GIC        2363 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
    6261                 : 
    6262            2363 :     write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
    6263                 :                                                   CheckpointStats.ckpt_sync_t);
    6264                 : 
    6265            2363 :     sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
    6266                 :                                                  CheckpointStats.ckpt_sync_end_t);
    6267                 : 
    6268                 :     /* Accumulate checkpoint timing summary data, in milliseconds. */
    6269            2363 :     PendingCheckpointerStats.checkpoint_write_time += write_msecs;
    6270            2363 :     PendingCheckpointerStats.checkpoint_sync_time += sync_msecs;
    6271 ECB             : 
    6272                 :     /*
    6273                 :      * All of the published timing statistics are accounted for.  Only
    6274                 :      * continue if a log message is to be written.
    6275                 :      */
    6276 CBC        2363 :     if (!log_checkpoints)
    6277 GIC        1822 :         return;
    6278                 : 
    6279             541 :     total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
    6280                 :                                                   CheckpointStats.ckpt_end_t);
    6281                 : 
    6282                 :     /*
    6283                 :      * Timing values returned from CheckpointStats are in microseconds.
    6284                 :      * Convert to milliseconds for consistent printing.
    6285 ECB             :      */
    6286 CBC         541 :     longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
    6287 ECB             : 
    6288 GIC         541 :     average_sync_time = 0;
    6289 CBC         541 :     if (CheckpointStats.ckpt_sync_rels > 0)
    6290 LBC           0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
    6291 UIC           0 :             CheckpointStats.ckpt_sync_rels;
    6292 GIC         541 :     average_msecs = (long) ((average_sync_time + 999) / 1000);
    6293 ECB             : 
    6294                 :     /*
    6295                 :      * ControlFileLock is not required to see ControlFile->checkPoint and
    6296                 :      * ->checkPointCopy here as we are the only updator of those variables at
    6297                 :      * this moment.
    6298                 :      */
    6299 GIC         541 :     if (restartpoint)
    6300              28 :         ereport(LOG,
    6301                 :                 (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
    6302                 :                         "%d WAL file(s) added, %d removed, %d recycled; "
    6303                 :                         "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
    6304                 :                         "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
    6305                 :                         "distance=%d kB, estimate=%d kB; "
    6306                 :                         "lsn=%X/%X, redo lsn=%X/%X",
    6307                 :                         CheckpointStats.ckpt_bufs_written,
    6308                 :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    6309                 :                         CheckpointStats.ckpt_segs_added,
    6310                 :                         CheckpointStats.ckpt_segs_removed,
    6311 EUB             :                         CheckpointStats.ckpt_segs_recycled,
    6312                 :                         write_msecs / 1000, (int) (write_msecs % 1000),
    6313                 :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    6314                 :                         total_msecs / 1000, (int) (total_msecs % 1000),
    6315                 :                         CheckpointStats.ckpt_sync_rels,
    6316                 :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    6317                 :                         average_msecs / 1000, (int) (average_msecs % 1000),
    6318                 :                         (int) (PrevCheckPointDistance / 1024.0),
    6319                 :                         (int) (CheckPointDistanceEstimate / 1024.0),
    6320                 :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    6321                 :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    6322                 :     else
    6323 GIC         513 :         ereport(LOG,
    6324                 :                 (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
    6325 ECB             :                         "%d WAL file(s) added, %d removed, %d recycled; "
    6326                 :                         "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
    6327                 :                         "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
    6328                 :                         "distance=%d kB, estimate=%d kB; "
    6329                 :                         "lsn=%X/%X, redo lsn=%X/%X",
    6330                 :                         CheckpointStats.ckpt_bufs_written,
    6331                 :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
    6332                 :                         CheckpointStats.ckpt_segs_added,
    6333                 :                         CheckpointStats.ckpt_segs_removed,
    6334                 :                         CheckpointStats.ckpt_segs_recycled,
    6335                 :                         write_msecs / 1000, (int) (write_msecs % 1000),
    6336                 :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
    6337                 :                         total_msecs / 1000, (int) (total_msecs % 1000),
    6338                 :                         CheckpointStats.ckpt_sync_rels,
    6339                 :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
    6340                 :                         average_msecs / 1000, (int) (average_msecs % 1000),
    6341                 :                         (int) (PrevCheckPointDistance / 1024.0),
    6342                 :                         (int) (CheckPointDistanceEstimate / 1024.0),
    6343                 :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
    6344                 :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
    6345                 : }
    6346                 : 
    6347                 : /*
    6348                 :  * Update the estimate of distance between checkpoints.
    6349                 :  *
    6350                 :  * The estimate is used to calculate the number of WAL segments to keep
    6351                 :  * preallocated, see XLOGfileslop().
    6352                 :  */
    6353                 : static void
    6354 CBC        2363 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
    6355 ECB             : {
    6356                 :     /*
    6357                 :      * To estimate the number of segments consumed between checkpoints, keep a
    6358                 :      * moving average of the amount of WAL generated in previous checkpoint
    6359                 :      * cycles. However, if the load is bursty, with quiet periods and busy
    6360                 :      * periods, we want to cater for the peak load. So instead of a plain
    6361                 :      * moving average, let the average decline slowly if the previous cycle
    6362                 :      * used less WAL than estimated, but bump it up immediately if it used
    6363                 :      * more.
    6364                 :      *
    6365                 :      * When checkpoints are triggered by max_wal_size, this should converge to
    6366                 :      * CheckpointSegments * wal_segment_size,
    6367                 :      *
    6368                 :      * Note: This doesn't pay any attention to what caused the checkpoint.
    6369                 :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
    6370                 :      * starting a base backup, are counted the same as those created
    6371                 :      * automatically. The slow-decline will largely mask them out, if they are
    6372                 :      * not frequent. If they are frequent, it seems reasonable to count them
    6373                 :      * in as any others; if you issue a manual checkpoint every 5 minutes and
    6374                 :      * never let a timed checkpoint happen, it makes sense to base the
    6375                 :      * preallocation on that 5 minute interval rather than whatever
    6376                 :      * checkpoint_timeout is set to.
    6377                 :      */
    6378 CBC        2363 :     PrevCheckPointDistance = nbytes;
    6379 GIC        2363 :     if (CheckPointDistanceEstimate < nbytes)
    6380            1015 :         CheckPointDistanceEstimate = nbytes;
    6381                 :     else
    6382            1348 :         CheckPointDistanceEstimate =
    6383            1348 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
    6384            2363 : }
    6385                 : 
    6386                 : /*
    6387                 :  * Update the ps display for a process running a checkpoint.  Note that
    6388                 :  * this routine should not do any allocations so as it can be called
    6389                 :  * from a critical section.
    6390 ECB             :  */
    6391                 : static void
    6392 GIC        4726 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
    6393                 : {
    6394                 :     /*
    6395                 :      * The status is reported only for end-of-recovery and shutdown
    6396                 :      * checkpoints or shutdown restartpoints.  Updating the ps display is
    6397                 :      * useful in those situations as it may not be possible to rely on
    6398                 :      * pg_stat_activity to see the status of the checkpointer or the startup
    6399                 :      * process.
    6400                 :      */
    6401 CBC        4726 :     if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
    6402 GIC        2766 :         return;
    6403                 : 
    6404            1960 :     if (reset)
    6405             980 :         set_ps_display("");
    6406                 :     else
    6407 ECB             :     {
    6408                 :         char        activitymsg[128];
    6409                 : 
    6410 GIC        2940 :         snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
    6411             980 :                  (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
    6412             980 :                  (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
    6413                 :                  restartpoint ? "restartpoint" : "checkpoint");
    6414             980 :         set_ps_display(activitymsg);
    6415                 :     }
    6416 ECB             : }
    6417                 : 
    6418                 : 
    6419                 : /*
    6420                 :  * Perform a checkpoint --- either during shutdown, or on-the-fly
    6421                 :  *
    6422                 :  * flags is a bitwise OR of the following:
    6423                 :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
    6424                 :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
    6425                 :  *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
    6426                 :  *      ignoring checkpoint_completion_target parameter.
    6427                 :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
    6428                 :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
    6429                 :  *      CHECKPOINT_END_OF_RECOVERY).
    6430                 :  *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
    6431                 :  *
    6432                 :  * Note: flags contains other bits, of interest here only for logging purposes.
    6433                 :  * In particular note that this routine is synchronous and does not pay
    6434                 :  * attention to CHECKPOINT_WAIT.
    6435                 :  *
    6436                 :  * If !shutdown then we are writing an online checkpoint. This is a very special
    6437                 :  * kind of operation and WAL record because the checkpoint action occurs over
    6438                 :  * a period of time yet logically occurs at just a single LSN. The logical
    6439                 :  * position of the WAL record (redo ptr) is the same or earlier than the
    6440                 :  * physical position. When we replay WAL we locate the checkpoint via its
    6441                 :  * physical position then read the redo ptr and actually start replay at the
    6442                 :  * earlier logical position. Note that we don't write *anything* to WAL at
    6443                 :  * the logical position, so that location could be any other kind of WAL record.
    6444                 :  * All of this mechanism allows us to continue working while we checkpoint.
    6445                 :  * As a result, timing of actions is critical here and be careful to note that
    6446 EUB             :  * this function will likely take minutes to execute on a busy system.
    6447                 :  */
    6448 ECB             : void
    6449 GIC        2340 : CreateCheckPoint(int flags)
    6450                 : {
    6451                 :     bool        shutdown;
    6452                 :     CheckPoint  checkPoint;
    6453                 :     XLogRecPtr  recptr;
    6454                 :     XLogSegNo   _logSegNo;
    6455 CBC        2340 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    6456 ECB             :     uint32      freespace;
    6457                 :     XLogRecPtr  PriorRedoPtr;
    6458                 :     XLogRecPtr  curInsert;
    6459                 :     XLogRecPtr  last_important_lsn;
    6460                 :     VirtualTransactionId *vxids;
    6461                 :     int         nvxids;
    6462 GIC        2340 :     int         oldXLogAllowed = 0;
    6463                 : 
    6464                 :     /*
    6465                 :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
    6466                 :      * issued at a different time.
    6467                 :      */
    6468            2340 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
    6469             967 :         shutdown = true;
    6470                 :     else
    6471            1373 :         shutdown = false;
    6472                 : 
    6473                 :     /* sanity check */
    6474            2340 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
    6475 UIC           0 :         elog(ERROR, "can't create a checkpoint during recovery");
    6476                 : 
    6477                 :     /*
    6478                 :      * Prepare to accumulate statistics.
    6479 ECB             :      *
    6480                 :      * Note: because it is possible for log_checkpoints to change while a
    6481                 :      * checkpoint proceeds, we always accumulate stats, even if
    6482                 :      * log_checkpoints is currently off.
    6483                 :      */
    6484 GIC       25740 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    6485            2340 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    6486                 : 
    6487                 :     /*
    6488                 :      * Let smgr prepare for checkpoint; this has to happen outside the
    6489                 :      * critical section and before we determine the REDO pointer.  Note that
    6490                 :      * smgr must not do anything that'd have to be undone if we decide no
    6491                 :      * checkpoint is needed.
    6492                 :      */
    6493            2340 :     SyncPreCheckpoint();
    6494                 : 
    6495                 :     /*
    6496                 :      * Use a critical section to force system panic if we have trouble.
    6497                 :      */
    6498            2340 :     START_CRIT_SECTION();
    6499                 : 
    6500            2340 :     if (shutdown)
    6501                 :     {
    6502             967 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6503             967 :         ControlFile->state = DB_SHUTDOWNING;
    6504             967 :         UpdateControlFile();
    6505             967 :         LWLockRelease(ControlFileLock);
    6506                 :     }
    6507                 : 
    6508                 :     /* Begin filling in the checkpoint WAL record */
    6509           28080 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
    6510 CBC        2340 :     checkPoint.time = (pg_time_t) time(NULL);
    6511                 : 
    6512                 :     /*
    6513                 :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
    6514                 :      * pointer. This allows us to begin accumulating changes to assemble our
    6515                 :      * starting snapshot of locks and transactions.
    6516                 :      */
    6517 GIC        2340 :     if (!shutdown && XLogStandbyInfoActive())
    6518            1353 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
    6519                 :     else
    6520             987 :         checkPoint.oldestActiveXid = InvalidTransactionId;
    6521                 : 
    6522                 :     /*
    6523                 :      * Get location of last important record before acquiring insert locks (as
    6524                 :      * GetLastImportantRecPtr() also locks WAL locks).
    6525                 :      */
    6526            2340 :     last_important_lsn = GetLastImportantRecPtr();
    6527                 : 
    6528                 :     /*
    6529                 :      * We must block concurrent insertions while examining insert state to
    6530                 :      * determine the checkpoint REDO pointer.
    6531                 :      */
    6532            2340 :     WALInsertLockAcquireExclusive();
    6533            2340 :     curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
    6534 ECB             : 
    6535                 :     /*
    6536                 :      * If this isn't a shutdown or forced checkpoint, and if there has been no
    6537                 :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
    6538                 :      * avoid inserting duplicate checkpoints when the system is idle.
    6539                 :      */
    6540 CBC        2340 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    6541                 :                   CHECKPOINT_FORCE)) == 0)
    6542                 :     {
    6543 GIC          28 :         if (last_important_lsn == ControlFile->checkPoint)
    6544                 :         {
    6545               5 :             WALInsertLockRelease();
    6546               5 :             END_CRIT_SECTION();
    6547               5 :             ereport(DEBUG1,
    6548 ECB             :                     (errmsg_internal("checkpoint skipped because system is idle")));
    6549 GIC           5 :             return;
    6550                 :         }
    6551                 :     }
    6552                 : 
    6553                 :     /*
    6554                 :      * An end-of-recovery checkpoint is created before anyone is allowed to
    6555                 :      * write WAL. To allow us to write the checkpoint record, temporarily
    6556                 :      * enable XLogInsertAllowed.
    6557 ECB             :      */
    6558 CBC        2335 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    6559 GIC          27 :         oldXLogAllowed = LocalSetXLogInsertAllowed();
    6560 ECB             : 
    6561 CBC        2335 :     checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    6562 GIC        2335 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
    6563              27 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    6564                 :     else
    6565            2308 :         checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
    6566 ECB             : 
    6567 CBC        2335 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
    6568 ECB             : 
    6569                 :     /*
    6570                 :      * Compute new REDO record ptr = location of next XLOG record.
    6571                 :      *
    6572                 :      * NB: this is NOT necessarily where the checkpoint record itself will be,
    6573                 :      * since other backends may insert more XLOG records while we're off doing
    6574                 :      * the buffer flush work.  Those XLOG records are logically after the
    6575                 :      * checkpoint, even though physically before it.  Got that?
    6576                 :      */
    6577 GIC        2335 :     freespace = INSERT_FREESPACE(curInsert);
    6578            2335 :     if (freespace == 0)
    6579                 :     {
    6580 UIC           0 :         if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
    6581               0 :             curInsert += SizeOfXLogLongPHD;
    6582                 :         else
    6583               0 :             curInsert += SizeOfXLogShortPHD;
    6584                 :     }
    6585 GIC        2335 :     checkPoint.redo = curInsert;
    6586                 : 
    6587                 :     /*
    6588                 :      * Here we update the shared RedoRecPtr for future XLogInsert calls; this
    6589                 :      * must be done while holding all the insertion locks.
    6590                 :      *
    6591                 :      * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
    6592                 :      * pointing past where it really needs to point.  This is okay; the only
    6593                 :      * consequence is that XLogInsert might back up whole buffers that it
    6594                 :      * didn't really need to.  We can't postpone advancing RedoRecPtr because
    6595                 :      * XLogInserts that happen while we are dumping buffers must assume that
    6596                 :      * their buffer changes are not included in the checkpoint.
    6597                 :      */
    6598            2335 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
    6599                 : 
    6600                 :     /*
    6601                 :      * Now we can release the WAL insertion locks, allowing other xacts to
    6602                 :      * proceed while we are flushing disk buffers.
    6603                 :      */
    6604            2335 :     WALInsertLockRelease();
    6605 ECB             : 
    6606                 :     /* Update the info_lck-protected copy of RedoRecPtr as well */
    6607 GIC        2335 :     SpinLockAcquire(&XLogCtl->info_lck);
    6608            2335 :     XLogCtl->RedoRecPtr = checkPoint.redo;
    6609            2335 :     SpinLockRelease(&XLogCtl->info_lck);
    6610                 : 
    6611 ECB             :     /*
    6612                 :      * If enabled, log checkpoint start.  We postpone this until now so as not
    6613                 :      * to log anything if we decided to skip the checkpoint.
    6614                 :      */
    6615 GIC        2335 :     if (log_checkpoints)
    6616             513 :         LogCheckpointStart(flags, false);
    6617                 : 
    6618 ECB             :     /* Update the process title */
    6619 GIC        2335 :     update_checkpoint_display(flags, false, false);
    6620                 : 
    6621                 :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
    6622                 : 
    6623                 :     /*
    6624 ECB             :      * Get the other info we need for the checkpoint record.
    6625                 :      *
    6626                 :      * We don't need to save oldestClogXid in the checkpoint, it only matters
    6627                 :      * for the short period in which clog is being truncated, and if we crash
    6628                 :      * during that we'll redo the clog truncation and fix up oldestClogXid
    6629                 :      * there.
    6630                 :      */
    6631 GBC        2335 :     LWLockAcquire(XidGenLock, LW_SHARED);
    6632 GIC        2335 :     checkPoint.nextXid = ShmemVariableCache->nextXid;
    6633            2335 :     checkPoint.oldestXid = ShmemVariableCache->oldestXid;
    6634            2335 :     checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
    6635            2335 :     LWLockRelease(XidGenLock);
    6636                 : 
    6637            2335 :     LWLockAcquire(CommitTsLock, LW_SHARED);
    6638            2335 :     checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
    6639            2335 :     checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
    6640 CBC        2335 :     LWLockRelease(CommitTsLock);
    6641 ECB             : 
    6642 GIC        2335 :     LWLockAcquire(OidGenLock, LW_SHARED);
    6643            2335 :     checkPoint.nextOid = ShmemVariableCache->nextOid;
    6644            2335 :     if (!shutdown)
    6645            1368 :         checkPoint.nextOid += ShmemVariableCache->oidCount;
    6646            2335 :     LWLockRelease(OidGenLock);
    6647                 : 
    6648            2335 :     MultiXactGetCheckptMulti(shutdown,
    6649 ECB             :                              &checkPoint.nextMulti,
    6650                 :                              &checkPoint.nextMultiOffset,
    6651                 :                              &checkPoint.oldestMulti,
    6652                 :                              &checkPoint.oldestMultiDB);
    6653                 : 
    6654                 :     /*
    6655                 :      * Having constructed the checkpoint record, ensure all shmem disk buffers
    6656                 :      * and commit-log buffers are flushed to disk.
    6657                 :      *
    6658                 :      * This I/O could fail for various reasons.  If so, we will fail to
    6659                 :      * complete the checkpoint, but there is no reason to force a system
    6660                 :      * panic. Accordingly, exit critical section while doing it.
    6661                 :      */
    6662 GIC        2335 :     END_CRIT_SECTION();
    6663                 : 
    6664                 :     /*
    6665 ECB             :      * In some cases there are groups of actions that must all occur on one
    6666                 :      * side or the other of a checkpoint record. Before flushing the
    6667                 :      * checkpoint record we must explicitly wait for any backend currently
    6668                 :      * performing those groups of actions.
    6669                 :      *
    6670                 :      * One example is end of transaction, so we must wait for any transactions
    6671                 :      * that are currently in commit critical sections.  If an xact inserted
    6672                 :      * its commit record into XLOG just before the REDO point, then a crash
    6673                 :      * restart from the REDO point would not replay that record, which means
    6674                 :      * that our flushing had better include the xact's update of pg_xact.  So
    6675                 :      * we wait until it is out of its commit critical section before proceeding.
    6676                 :      * See notes in RecordTransactionCommit().
    6677                 :      *
    6678                 :      * Because we've already released the insertion locks, this test is a bit
    6679                 :      * fuzzy: it is possible that we will wait for xacts we didn't really need
    6680                 :      * to wait for.  But the delay should be short and it seems better to make
    6681                 :      * checkpoint take a bit longer than to hold off insertions longer than
    6682                 :      * necessary. (In fact, the whole reason we have this issue is that xact.c
    6683                 :      * does commit record XLOG insertion and clog update as two separate steps
    6684                 :      * protected by different locks, but again that seems best on grounds of
    6685                 :      * minimizing lock contention.)
    6686                 :      *
    6687                 :      * A transaction that has not yet set delayChkptFlags when we look cannot
    6688                 :      * be at risk, since it has not inserted its commit record yet; and one
    6689                 :      * that's already cleared it is not at risk either, since it's done fixing
    6690                 :      * clog and we will correctly flush the update below.  So we cannot miss
    6691                 :      * any xacts we need to wait for.
    6692                 :      */
    6693 GIC        2335 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
    6694            2335 :     if (nvxids > 0)
    6695                 :     {
    6696 ECB             :         do
    6697                 :         {
    6698 GIC           9 :             pg_usleep(10000L);  /* wait for 10 msec */
    6699 CBC           9 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    6700                 :                                               DELAY_CHKPT_START));
    6701 ECB             :     }
    6702 CBC        2335 :     pfree(vxids);
    6703 ECB             : 
    6704 GIC        2335 :     CheckPointGuts(checkPoint.redo, flags);
    6705 ECB             : 
    6706 GIC        2335 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
    6707            2335 :     if (nvxids > 0)
    6708                 :     {
    6709                 :         do
    6710                 :         {
    6711 UIC           0 :             pg_usleep(10000L);  /* wait for 10 msec */
    6712               0 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
    6713                 :                                               DELAY_CHKPT_COMPLETE));
    6714 ECB             :     }
    6715 CBC        2335 :     pfree(vxids);
    6716                 : 
    6717 ECB             :     /*
    6718                 :      * Take a snapshot of running transactions and write this to WAL. This
    6719                 :      * allows us to reconstruct the state of running transactions during
    6720                 :      * archive recovery, if required. Skip this if that info is disabled.
    6721                 :      *
    6722                 :      * If we are shutting down, or the startup process is completing crash
    6723                 :      * recovery, we don't need to write running xact data.
    6724                 :      */
    6725 GIC        2335 :     if (!shutdown && XLogStandbyInfoActive())
    6726            1348 :         LogStandbySnapshot();
    6727                 : 
    6728            2335 :     START_CRIT_SECTION();
    6729                 : 
    6730                 :     /*
    6731                 :      * Now insert the checkpoint record into XLOG.
    6732                 :      */
    6733 CBC        2335 :     XLogBeginInsert();
    6734            2335 :     XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
    6735 GIC        2335 :     recptr = XLogInsert(RM_XLOG_ID,
    6736 EUB             :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
    6737                 :                         XLOG_CHECKPOINT_ONLINE);
    6738                 : 
    6739 GBC        2335 :     XLogFlush(recptr);
    6740                 : 
    6741 ECB             :     /*
    6742                 :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
    6743                 :      * overwritten at next startup.  No-one should even try, this just allows
    6744                 :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
    6745                 :      * to just temporarily disable writing until the system has exited
    6746                 :      * recovery.
    6747                 :      */
    6748 GIC        2335 :     if (shutdown)
    6749                 :     {
    6750             967 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
    6751              27 :             LocalXLogInsertAllowed = oldXLogAllowed;
    6752                 :         else
    6753             940 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
    6754 ECB             :     }
    6755                 : 
    6756                 :     /*
    6757                 :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
    6758                 :      * = end of actual checkpoint record.
    6759                 :      */
    6760 CBC        2335 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
    6761 UIC           0 :         ereport(PANIC,
    6762                 :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
    6763 ECB             : 
    6764                 :     /*
    6765                 :      * Remember the prior checkpoint's redo ptr for
    6766                 :      * UpdateCheckPointDistanceEstimate()
    6767                 :      */
    6768 GIC        2335 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    6769                 : 
    6770                 :     /*
    6771 ECB             :      * Update the control file.
    6772                 :      */
    6773 GIC        2335 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6774            2335 :     if (shutdown)
    6775 CBC         967 :         ControlFile->state = DB_SHUTDOWNED;
    6776 GIC        2335 :     ControlFile->checkPoint = ProcLastRecPtr;
    6777            2335 :     ControlFile->checkPointCopy = checkPoint;
    6778                 :     /* crash recovery should always recover to the end of WAL */
    6779            2335 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    6780            2335 :     ControlFile->minRecoveryPointTLI = 0;
    6781                 : 
    6782                 :     /*
    6783                 :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
    6784                 :      * unused on non-shutdown checkpoints, but seems useful to store it always
    6785                 :      * for debugging purposes.
    6786                 :      */
    6787 CBC        2335 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
    6788            2335 :     ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
    6789            2335 :     SpinLockRelease(&XLogCtl->ulsn_lck);
    6790 ECB             : 
    6791 CBC        2335 :     UpdateControlFile();
    6792 GIC        2335 :     LWLockRelease(ControlFileLock);
    6793 ECB             : 
    6794                 :     /* Update shared-memory copy of checkpoint XID/epoch */
    6795 CBC        2335 :     SpinLockAcquire(&XLogCtl->info_lck);
    6796            2335 :     XLogCtl->ckptFullXid = checkPoint.nextXid;
    6797 GIC        2335 :     SpinLockRelease(&XLogCtl->info_lck);
    6798 ECB             : 
    6799                 :     /*
    6800                 :      * We are now done with critical updates; no need for system panic if we
    6801                 :      * have trouble while fooling with old log segments.
    6802                 :      */
    6803 GIC        2335 :     END_CRIT_SECTION();
    6804 ECB             : 
    6805                 :     /*
    6806                 :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
    6807                 :      */
    6808 GIC        2335 :     SyncPostCheckpoint();
    6809                 : 
    6810                 :     /*
    6811                 :      * Update the average distance between checkpoints if the prior checkpoint
    6812                 :      * exists.
    6813                 :      */
    6814            2335 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    6815            2335 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    6816                 : 
    6817                 :     /*
    6818 ECB             :      * Delete old log files that are no longer needed for the last checkpoint,
    6819                 :      * to prevent the disk holding the xlog from filling up.
    6820                 :      */
    6821 GIC        2335 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    6822            2335 :     KeepLogSeg(recptr, &_logSegNo);
    6823 GNC        2335 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
    6824                 :                                            _logSegNo, InvalidOid,
    6825                 :                                            InvalidTransactionId))
    6826                 :     {
    6827                 :         /*
    6828                 :          * Some slots have been invalidated; recalculate the old-segment
    6829                 :          * horizon, starting again from RedoRecPtr.
    6830                 :          */
    6831 GIC           3 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    6832               3 :         KeepLogSeg(recptr, &_logSegNo);
    6833                 :     }
    6834            2335 :     _logSegNo--;
    6835            2335 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
    6836                 :                        checkPoint.ThisTimeLineID);
    6837                 : 
    6838                 :     /*
    6839                 :      * Make more log segments if needed.  (Do this after recycling old log
    6840                 :      * segments, since that may supply some of the needed files.)
    6841                 :      */
    6842            2335 :     if (!shutdown)
    6843            1368 :         PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
    6844                 : 
    6845                 :     /*
    6846                 :      * Truncate pg_subtrans if possible.  We can throw away all data before
    6847                 :      * the oldest XMIN of any running transaction.  No future transaction will
    6848                 :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    6849                 :      * in subtrans.c).  During recovery, though, we mustn't do this because
    6850                 :      * StartupSUBTRANS hasn't been called yet.
    6851 ECB             :      */
    6852 CBC        2335 :     if (!RecoveryInProgress())
    6853 GIC        2308 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    6854                 : 
    6855                 :     /* Real work is done; log and update stats. */
    6856 CBC        2335 :     LogCheckpointEnd(false);
    6857 ECB             : 
    6858                 :     /* Reset the process title */
    6859 GIC        2335 :     update_checkpoint_display(flags, false, true);
    6860 ECB             : 
    6861                 :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
    6862                 :                                      NBuffers,
    6863                 :                                      CheckpointStats.ckpt_segs_added,
    6864                 :                                      CheckpointStats.ckpt_segs_removed,
    6865                 :                                      CheckpointStats.ckpt_segs_recycled);
    6866                 : }
    6867                 : 
    6868                 : /*
    6869 EUB             :  * Mark the end of recovery in WAL, but without running a full checkpoint.
    6870                 :  * We can expect that a restartpoint is likely to be in progress as we
    6871                 :  * do this, though we are unwilling to wait for it to complete.
    6872                 :  *
    6873 ECB             :  * CreateRestartPoint() allows for the case where recovery may end before
    6874                 :  * the restartpoint completes so there is no concern of concurrent behaviour.
    6875                 :  */
    6876                 : static void
    6877 GIC          36 : CreateEndOfRecoveryRecord(void)
    6878                 : {
    6879                 :     xl_end_of_recovery xlrec;
    6880                 :     XLogRecPtr  recptr;
    6881                 : 
    6882                 :     /* sanity check */
    6883 CBC          36 :     if (!RecoveryInProgress())
    6884 LBC           0 :         elog(ERROR, "can only be used to end recovery");
    6885                 : 
    6886 CBC          36 :     xlrec.end_time = GetCurrentTimestamp();
    6887                 : 
    6888 GIC          36 :     WALInsertLockAcquireExclusive();
    6889              36 :     xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
    6890              36 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    6891 CBC          36 :     WALInsertLockRelease();
    6892 ECB             : 
    6893 CBC          36 :     START_CRIT_SECTION();
    6894                 : 
    6895 GIC          36 :     XLogBeginInsert();
    6896              36 :     XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
    6897 CBC          36 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
    6898                 : 
    6899 GIC          36 :     XLogFlush(recptr);
    6900                 : 
    6901                 :     /*
    6902                 :      * Update the control file so that crash recovery can follow the timeline
    6903                 :      * changes to this point.
    6904                 :      */
    6905              36 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    6906 CBC          36 :     ControlFile->minRecoveryPoint = recptr;
    6907 GIC          36 :     ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
    6908 CBC          36 :     UpdateControlFile();
    6909              36 :     LWLockRelease(ControlFileLock);
    6910                 : 
    6911              36 :     END_CRIT_SECTION();
    6912 GIC          36 : }
    6913                 : 
    6914                 : /*
    6915                 :  * Write an OVERWRITE_CONTRECORD message.
    6916                 :  *
    6917                 :  * If WAL replay expects a continuation record at the start of a page but it
    6918 ECB             :  * is not there, recovery ends and WAL writing resumes at that point.
    6919 EUB             :  * But it's wrong to resume writing new WAL back at the start of the record
    6920                 :  * that was broken, because downstream consumers of that WAL (physical
    6921                 :  * replicas) are not prepared to "rewind".  So the first action after
    6922                 :  * finishing replay of all valid WAL must be to write a record of this type
    6923                 :  * at the point where the contrecord was missing; to support xlogreader
    6924                 :  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
    6925                 :  * to the page header where the record occurs.  xlogreader has an ad-hoc
    6926 ECB             :  * mechanism to report metadata about the broken record, which is what we
    6927                 :  * use here.
    6928                 :  *
    6929                 :  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
    6930                 :  * skip the record it was reading, and pass back the LSN of the skipped
    6931                 :  * record, so that its caller can verify (on "replay" of that record) that the
    6932                 :  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
    6933                 :  *
    6934                 :  * 'aborted_lsn' is the beginning position of the record that was incomplete.
    6935                 :  * It is included in the WAL record.  'pagePtr' and 'newTLI' point to the
    6936                 :  * beginning of the XLOG page where the record is to be inserted.  They must
    6937                 :  * match the current WAL insert position, they're passed here just so that we
    6938                 :  * can verify that.
    6939                 :  */
    6940                 : static XLogRecPtr
    6941 GIC           1 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
    6942                 :                                 TimeLineID newTLI)
    6943                 : {
    6944                 :     xl_overwrite_contrecord xlrec;
    6945 ECB             :     XLogRecPtr  recptr;
    6946                 :     XLogPageHeader pagehdr;
    6947                 :     XLogRecPtr  startPos;
    6948                 : 
    6949                 :     /* sanity checks */
    6950 CBC           1 :     if (!RecoveryInProgress())
    6951 UIC           0 :         elog(ERROR, "can only be used at end of recovery");
    6952 GIC           1 :     if (pagePtr % XLOG_BLCKSZ != 0)
    6953 LBC           0 :         elog(ERROR, "invalid position for missing continuation record %X/%X",
    6954 ECB             :              LSN_FORMAT_ARGS(pagePtr));
    6955                 : 
    6956                 :     /* The current WAL insert position should be right after the page header */
    6957 GIC           1 :     startPos = pagePtr;
    6958               1 :     if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
    6959               1 :         startPos += SizeOfXLogLongPHD;
    6960                 :     else
    6961 LBC           0 :         startPos += SizeOfXLogShortPHD;
    6962 GIC           1 :     recptr = GetXLogInsertRecPtr();
    6963               1 :     if (recptr != startPos)
    6964 UIC           0 :         elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
    6965                 :              LSN_FORMAT_ARGS(recptr));
    6966 ECB             : 
    6967 GIC           1 :     START_CRIT_SECTION();
    6968                 : 
    6969                 :     /*
    6970                 :      * Initialize the XLOG page header (by GetXLogBuffer), and set the
    6971                 :      * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
    6972 ECB             :      *
    6973                 :      * No other backend is allowed to write WAL yet, so acquiring the WAL
    6974                 :      * insertion lock is just pro forma.
    6975                 :      */
    6976 GIC           1 :     WALInsertLockAcquire();
    6977               1 :     pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
    6978               1 :     pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
    6979 CBC           1 :     WALInsertLockRelease();
    6980 ECB             : 
    6981                 :     /*
    6982                 :      * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
    6983                 :      * page.  We know it becomes the first record, because no other backend is
    6984                 :      * allowed to write WAL yet.
    6985                 :      */
    6986 GIC           1 :     XLogBeginInsert();
    6987               1 :     xlrec.overwritten_lsn = aborted_lsn;
    6988               1 :     xlrec.overwrite_time = GetCurrentTimestamp();
    6989 CBC           1 :     XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
    6990               1 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
    6991                 : 
    6992 ECB             :     /* check that the record was inserted to the right place */
    6993 CBC           1 :     if (ProcLastRecPtr != startPos)
    6994 UIC           0 :         elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
    6995                 :              LSN_FORMAT_ARGS(ProcLastRecPtr));
    6996                 : 
    6997 GIC           1 :     XLogFlush(recptr);
    6998                 : 
    6999               1 :     END_CRIT_SECTION();
    7000 ECB             : 
    7001 CBC           1 :     return recptr;
    7002                 : }
    7003                 : 
    7004                 : /*
    7005                 :  * Flush all data in shared memory to disk, and fsync
    7006                 :  *
    7007                 :  * This is the common code shared between regular checkpoints and
    7008                 :  * recovery restartpoints.
    7009                 :  */
    7010 ECB             : static void
    7011 CBC        2363 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
    7012                 : {
    7013 GIC        2363 :     CheckPointRelationMap();
    7014 CBC        2363 :     CheckPointReplicationSlots();
    7015 GIC        2363 :     CheckPointSnapBuild();
    7016            2363 :     CheckPointLogicalRewriteHeap();
    7017 CBC        2363 :     CheckPointReplicationOrigin();
    7018                 : 
    7019                 :     /* Write out all dirty data in SLRUs and the main buffer pool */
    7020                 :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
    7021 GIC        2363 :     CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
    7022            2363 :     CheckPointCLOG();
    7023            2363 :     CheckPointCommitTs();
    7024            2363 :     CheckPointSUBTRANS();
    7025            2363 :     CheckPointMultiXact();
    7026            2363 :     CheckPointPredicate();
    7027            2363 :     CheckPointBuffers(flags);
    7028                 : 
    7029                 :     /* Perform all queued up fsyncs */
    7030                 :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
    7031            2363 :     CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
    7032            2363 :     ProcessSyncRequests();
    7033            2363 :     CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
    7034                 :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
    7035 ECB             : 
    7036                 :     /* We deliberately delay 2PC checkpointing as long as possible */
    7037 GIC        2363 :     CheckPointTwoPhase(checkPointRedo);
    7038            2363 : }
    7039                 : 
    7040                 : /*
    7041 ECB             :  * Save a checkpoint for recovery restart if appropriate
    7042 EUB             :  *
    7043                 :  * This function is called each time a checkpoint record is read from XLOG.
    7044 ECB             :  * It must determine whether the checkpoint represents a safe restartpoint or
    7045                 :  * not.  If so, the checkpoint record is stashed in shared memory so that
    7046                 :  * CreateRestartPoint can consult it.  (Note that the latter function is
    7047                 :  * executed by the checkpointer, while this one will be executed by the
    7048                 :  * startup process.)
    7049                 :  */
    7050                 : static void
    7051 CBC         166 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
    7052                 : {
    7053 ECB             :     /*
    7054                 :      * Refrain from creating a restartpoint if we have seen any
    7055                 :      * references to non-existent pages. Restarting recovery from the
    7056                 :      * restartpoint would not see the references, so we would lose the
    7057                 :      * cross-check that the pages belonged to a relation that was dropped
    7058                 :      * later.
    7059                 :      */
    7060 GIC         166 :     if (XLogHaveInvalidPages())
    7061                 :     {
    7062 UIC           0 :         elog(trace_recovery(DEBUG2),
    7063 ECB             :              "could not record restart point at %X/%X because there "
    7064                 :              "are unresolved references to invalid pages",
    7065                 :              LSN_FORMAT_ARGS(checkPoint->redo));
    7066 LBC           0 :         return;
    7067 ECB             :     }
    7068                 : 
    7069                 :     /*
    7070                 :      * Copy the checkpoint record to shared memory, so that checkpointer can
    7071                 :      * work out the next time it wants to perform a restartpoint.
    7072                 :      */
    7073 GIC         166 :     SpinLockAcquire(&XLogCtl->info_lck);
    7074             166 :     XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
    7075             166 :     XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
    7076             166 :     XLogCtl->lastCheckPoint = *checkPoint;
    7077             166 :     SpinLockRelease(&XLogCtl->info_lck);
    7078                 : }
    7079                 : 
    7080                 : /*
    7081                 :  * Establish a restartpoint if possible.
    7082                 :  *
    7083                 :  * This is similar to CreateCheckPoint, but is used during WAL recovery
    7084                 :  * to establish a point from which recovery can roll forward without
    7085                 :  * replaying the entire recovery log.
    7086                 :  *
    7087                 :  * Returns true if a new restartpoint was established. We can only establish
    7088                 :  * a restartpoint if we have replayed a safe checkpoint record since last
    7089                 :  * restartpoint.
    7090                 :  */
    7091                 : bool
    7092              71 : CreateRestartPoint(int flags)
    7093                 : {
    7094                 :     XLogRecPtr  lastCheckPointRecPtr;
    7095                 :     XLogRecPtr  lastCheckPointEndPtr;
    7096                 :     CheckPoint  lastCheckPoint;
    7097                 :     XLogRecPtr  PriorRedoPtr;
    7098                 :     XLogRecPtr  receivePtr;
    7099 ECB             :     XLogRecPtr  replayPtr;
    7100                 :     TimeLineID  replayTLI;
    7101                 :     XLogRecPtr  endptr;
    7102                 :     XLogSegNo   _logSegNo;
    7103                 :     TimestampTz xtime;
    7104                 : 
    7105                 :     /* Concurrent checkpoint/restartpoint cannot happen */
    7106 GIC          71 :     Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
    7107                 : 
    7108 ECB             :     /* Get a local copy of the last safe checkpoint record. */
    7109 GBC          71 :     SpinLockAcquire(&XLogCtl->info_lck);
    7110 CBC          71 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
    7111 GBC          71 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
    7112 GIC          71 :     lastCheckPoint = XLogCtl->lastCheckPoint;
    7113              71 :     SpinLockRelease(&XLogCtl->info_lck);
    7114                 : 
    7115 ECB             :     /*
    7116                 :      * Check that we're still in recovery mode. It's ok if we exit recovery
    7117                 :      * mode after this check, the restart point is valid anyway.
    7118                 :      */
    7119 GBC          71 :     if (!RecoveryInProgress())
    7120 ECB             :     {
    7121 LBC           0 :         ereport(DEBUG2,
    7122 EUB             :                 (errmsg_internal("skipping restartpoint, recovery has already ended")));
    7123 UIC           0 :         return false;
    7124                 :     }
    7125 ECB             : 
    7126                 :     /*
    7127                 :      * If the last checkpoint record we've replayed is already our last
    7128                 :      * restartpoint, we can't perform a new restart point. We still update
    7129                 :      * minRecoveryPoint in that case, so that if this is a shutdown restart
    7130                 :      * point, we won't start up earlier than before. That's not strictly
    7131                 :      * necessary, but when hot standby is enabled, it would be rather weird if
    7132                 :      * the database opened up for read-only connections at a point-in-time
    7133                 :      * before the last shutdown. Such time travel is still possible in case of
    7134                 :      * immediate shutdown, though.
    7135                 :      *
    7136                 :      * We don't explicitly advance minRecoveryPoint when we do create a
    7137                 :      * restartpoint. It's assumed that flushing the buffers will do that as a
    7138                 :      * side-effect.
    7139                 :      */
    7140 GIC          71 :     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
    7141              69 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    7142                 :     {
    7143              43 :         ereport(DEBUG2,
    7144 ECB             :                 (errmsg_internal("skipping restartpoint, already performed at %X/%X",
    7145                 :                                  LSN_FORMAT_ARGS(lastCheckPoint.redo))));
    7146                 : 
    7147 CBC          43 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    7148              43 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
    7149                 :         {
    7150 GIC          18 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7151 CBC          18 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    7152 GBC          18 :             UpdateControlFile();
    7153 GIC          18 :             LWLockRelease(ControlFileLock);
    7154                 :         }
    7155 CBC          43 :         return false;
    7156                 :     }
    7157 ECB             : 
    7158                 :     /*
    7159                 :      * Update the shared RedoRecPtr so that the startup process can calculate
    7160                 :      * the number of segments replayed since last restartpoint, and request a
    7161                 :      * restartpoint if it exceeds CheckPointSegments.
    7162                 :      *
    7163                 :      * Like in CreateCheckPoint(), hold off insertions to update it, although
    7164                 :      * during recovery this is just pro forma, because no WAL insertions are
    7165                 :      * happening.
    7166                 :      */
    7167 GIC          28 :     WALInsertLockAcquireExclusive();
    7168              28 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
    7169 CBC          28 :     WALInsertLockRelease();
    7170                 : 
    7171 ECB             :     /* Also update the info_lck-protected copy */
    7172 CBC          28 :     SpinLockAcquire(&XLogCtl->info_lck);
    7173              28 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
    7174              28 :     SpinLockRelease(&XLogCtl->info_lck);
    7175 ECB             : 
    7176                 :     /*
    7177                 :      * Prepare to accumulate statistics.
    7178                 :      *
    7179                 :      * Note: because it is possible for log_checkpoints to change while a
    7180                 :      * checkpoint proceeds, we always accumulate stats, even if
    7181                 :      * log_checkpoints is currently off.
    7182                 :      */
    7183 CBC         308 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    7184              28 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
    7185 ECB             : 
    7186 GIC          28 :     if (log_checkpoints)
    7187              28 :         LogCheckpointStart(flags, true);
    7188                 : 
    7189 ECB             :     /* Update the process title */
    7190 CBC          28 :     update_checkpoint_display(flags, true, false);
    7191 ECB             : 
    7192 GIC          28 :     CheckPointGuts(lastCheckPoint.redo, flags);
    7193                 : 
    7194                 :     /*
    7195 ECB             :      * Remember the prior checkpoint's redo ptr for
    7196                 :      * UpdateCheckPointDistanceEstimate()
    7197                 :      */
    7198 GIC          28 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
    7199                 : 
    7200                 :     /*
    7201                 :      * Update pg_control, using current time.  Check that it still shows an
    7202                 :      * older checkpoint, else do nothing; this is a quick hack to make sure
    7203                 :      * nothing really bad happens if somehow we get here after the
    7204                 :      * end-of-recovery checkpoint.
    7205                 :      */
    7206              28 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7207              28 :     if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    7208                 :     {
    7209 ECB             :         /*
    7210                 :          * Update the checkpoint information.  We do this even if the cluster
    7211                 :          * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
    7212                 :          * segments recycled below.
    7213                 :          */
    7214 GIC          28 :         ControlFile->checkPoint = lastCheckPointRecPtr;
    7215              28 :         ControlFile->checkPointCopy = lastCheckPoint;
    7216                 : 
    7217                 :         /*
    7218 ECB             :          * Ensure minRecoveryPoint is past the checkpoint record and update it
    7219                 :          * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
    7220 EUB             :          * this will have happened already while writing out dirty buffers,
    7221                 :          * but not necessarily - e.g. because no buffers were dirtied.  We do
    7222                 :          * this because a backup performed in recovery uses minRecoveryPoint
    7223                 :          * to determine which WAL files must be included in the backup, and
    7224                 :          * the file (or files) containing the checkpoint record must be
    7225                 :          * included, at a minimum.  Note that for an ordinary restart of
    7226                 :          * recovery there's no value in having the minimum recovery point any
    7227                 :          * earlier than this anyway, because redo will begin just after the
    7228                 :          * checkpoint record.
    7229                 :          */
    7230 GIC          28 :         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    7231 ECB             :         {
    7232 CBC          28 :             if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
    7233 ECB             :             {
    7234 CBC           6 :                 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
    7235               6 :                 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
    7236                 : 
    7237                 :                 /* update local copy */
    7238 GIC           6 :                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    7239               6 :                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    7240                 :             }
    7241              28 :             if (flags & CHECKPOINT_IS_SHUTDOWN)
    7242              13 :                 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    7243                 :         }
    7244              28 :         UpdateControlFile();
    7245                 :     }
    7246              28 :     LWLockRelease(ControlFileLock);
    7247                 : 
    7248                 :     /*
    7249                 :      * Update the average distance between checkpoints/restartpoints if the
    7250 ECB             :      * prior checkpoint exists.
    7251                 :      */
    7252 GIC          28 :     if (PriorRedoPtr != InvalidXLogRecPtr)
    7253              28 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
    7254                 : 
    7255                 :     /*
    7256                 :      * Delete old log files, those no longer needed for last restartpoint to
    7257                 :      * prevent the disk holding the xlog from growing full.
    7258                 :      */
    7259              28 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7260                 : 
    7261                 :     /*
    7262                 :      * Retreat _logSegNo using the current end of xlog replayed or received,
    7263                 :      * whichever is later.
    7264 ECB             :      */
    7265 GIC          28 :     receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
    7266              28 :     replayPtr = GetXLogReplayRecPtr(&replayTLI);
    7267 CBC          28 :     endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
    7268              28 :     KeepLogSeg(endptr, &_logSegNo);
    7269 GNC          28 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
    7270                 :                                            _logSegNo, InvalidOid,
    7271                 :                                            InvalidTransactionId))
    7272 ECB             :     {
    7273                 :         /*
    7274                 :          * Some slots have been invalidated; recalculate the old-segment
    7275                 :          * horizon, starting again from RedoRecPtr.
    7276                 :          */
    7277 UIC           0 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    7278               0 :         KeepLogSeg(endptr, &_logSegNo);
    7279 ECB             :     }
    7280 GIC          28 :     _logSegNo--;
    7281 EUB             : 
    7282                 :     /*
    7283                 :      * Try to recycle segments on a useful timeline. If we've been promoted
    7284                 :      * since the beginning of this restartpoint, use the new timeline chosen
    7285                 :      * at end of recovery.  If we're still in recovery, use the timeline we're
    7286                 :      * currently replaying.
    7287                 :      *
    7288                 :      * There is no guarantee that the WAL segments will be useful on the
    7289                 :      * current timeline; if recovery proceeds to a new timeline right after
    7290                 :      * this, the pre-allocated WAL segments on this timeline will not be used,
    7291                 :      * and will go wasted until recycled on the next restartpoint. We'll live
    7292                 :      * with that.
    7293                 :      */
    7294 GIC          28 :     if (!RecoveryInProgress())
    7295 UIC           0 :         replayTLI = XLogCtl->InsertTimeLineID;
    7296                 : 
    7297 GIC          28 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
    7298                 : 
    7299                 :     /*
    7300 ECB             :      * Make more log segments if needed.  (Do this after recycling old log
    7301                 :      * segments, since that may supply some of the needed files.)
    7302                 :      */
    7303 CBC          28 :     PreallocXlogFiles(endptr, replayTLI);
    7304                 : 
    7305                 :     /*
    7306                 :      * Truncate pg_subtrans if possible.  We can throw away all data before
    7307 ECB             :      * the oldest XMIN of any running transaction.  No future transaction will
    7308                 :      * attempt to reference any pg_subtrans entry older than that (see Asserts
    7309                 :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
    7310                 :      * this because StartupSUBTRANS hasn't been called yet.
    7311                 :      */
    7312 CBC          28 :     if (EnableHotStandby)
    7313              28 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
    7314                 : 
    7315 ECB             :     /* Real work is done; log and update stats. */
    7316 GIC          28 :     LogCheckpointEnd(true);
    7317                 : 
    7318                 :     /* Reset the process title */
    7319              28 :     update_checkpoint_display(flags, true, true);
    7320                 : 
    7321              28 :     xtime = GetLatestXTime();
    7322              28 :     ereport((log_checkpoints ? LOG : DEBUG2),
    7323                 :             (errmsg("recovery restart point at %X/%X",
    7324                 :                     LSN_FORMAT_ARGS(lastCheckPoint.redo)),
    7325                 :              xtime ? errdetail("Last completed transaction was at log time %s.",
    7326                 :                                timestamptz_to_str(xtime)) : 0));
    7327 ECB             : 
    7328                 :     /*
    7329                 :      * Finally, execute archive_cleanup_command, if any.
    7330                 :      */
    7331 GIC          28 :     if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
    7332 LBC           0 :         ExecuteRecoveryCommand(archiveCleanupCommand,
    7333 ECB             :                                "archive_cleanup_command",
    7334                 :                                false,
    7335                 :                                WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
    7336                 : 
    7337 GIC          28 :     return true;
    7338                 : }
    7339                 : 
    7340                 : /*
    7341                 :  * Report availability of WAL for the given target LSN
    7342                 :  *      (typically a slot's restart_lsn)
    7343 ECB             :  *
    7344                 :  * Returns one of the following enum values:
    7345                 :  *
    7346                 :  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
    7347                 :  *   max_wal_size.
    7348                 :  *
    7349                 :  * * WALAVAIL_EXTENDED means it is still available by preserving extra
    7350                 :  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
    7351                 :  *   than max_wal_size, this state is not returned.
    7352                 :  *
    7353                 :  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
    7354                 :  *   remove reserved segments. The walsender using this slot may return to the
    7355                 :  *   above.
    7356                 :  *
    7357                 :  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
    7358                 :  *   a slot with this LSN cannot continue.  (Any associated walsender
    7359                 :  *   processes should have been terminated already.)
    7360                 :  *
    7361                 :  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
    7362                 :  */
    7363                 : WALAvailability
    7364 GIC         305 : GetWALAvailability(XLogRecPtr targetLSN)
    7365                 : {
    7366                 :     XLogRecPtr  currpos;        /* current write LSN */
    7367 ECB             :     XLogSegNo   currSeg;        /* segid of currpos */
    7368                 :     XLogSegNo   targetSeg;      /* segid of targetLSN */
    7369                 :     XLogSegNo   oldestSeg;      /* actual oldest segid */
    7370                 :     XLogSegNo   oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
    7371                 :     XLogSegNo   oldestSlotSeg;  /* oldest segid kept by slot */
    7372                 :     uint64      keepSegs;
    7373                 : 
    7374                 :     /*
    7375                 :      * slot does not reserve WAL. Either deactivated, or has never been active
    7376                 :      */
    7377 GIC         305 :     if (XLogRecPtrIsInvalid(targetLSN))
    7378              11 :         return WALAVAIL_INVALID_LSN;
    7379                 : 
    7380                 :     /*
    7381                 :      * Calculate the oldest segment currently reserved by all slots,
    7382                 :      * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
    7383                 :      * oldestSlotSeg to the current segment.
    7384                 :      */
    7385             294 :     currpos = GetXLogWriteRecPtr();
    7386             294 :     XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
    7387             294 :     KeepLogSeg(currpos, &oldestSlotSeg);
    7388                 : 
    7389                 :     /*
    7390                 :      * Find the oldest extant segment file. We get 1 until checkpoint removes
    7391 ECB             :      * the first WAL segment file since startup, which causes the status being
    7392                 :      * wrong under certain abnormal conditions but that doesn't actually harm.
    7393                 :      */
    7394 GIC         294 :     oldestSeg = XLogGetLastRemovedSegno() + 1;
    7395 ECB             : 
    7396                 :     /* calculate oldest segment by max_wal_size */
    7397 GIC         294 :     XLByteToSeg(currpos, currSeg, wal_segment_size);
    7398             294 :     keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
    7399 ECB             : 
    7400 CBC         294 :     if (currSeg > keepSegs)
    7401 GIC           8 :         oldestSegMaxWalSize = currSeg - keepSegs;
    7402 ECB             :     else
    7403 CBC         286 :         oldestSegMaxWalSize = 1;
    7404                 : 
    7405 ECB             :     /* the segment we care about */
    7406 GIC         294 :     XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
    7407 ECB             : 
    7408                 :     /*
    7409                 :      * No point in returning reserved or extended status values if the
    7410                 :      * targetSeg is known to be lost.
    7411                 :      */
    7412 GIC         294 :     if (targetSeg >= oldestSlotSeg)
    7413 ECB             :     {
    7414                 :         /* show "reserved" when targetSeg is within max_wal_size */
    7415 GIC         293 :         if (targetSeg >= oldestSegMaxWalSize)
    7416             291 :             return WALAVAIL_RESERVED;
    7417                 : 
    7418                 :         /* being retained by slots exceeding max_wal_size */
    7419               2 :         return WALAVAIL_EXTENDED;
    7420 ECB             :     }
    7421                 : 
    7422                 :     /* WAL segments are no longer retained but haven't been removed yet */
    7423 GIC           1 :     if (targetSeg >= oldestSeg)
    7424               1 :         return WALAVAIL_UNRESERVED;
    7425                 : 
    7426 ECB             :     /* Definitely lost */
    7427 LBC           0 :     return WALAVAIL_REMOVED;
    7428 ECB             : }
    7429                 : 
    7430                 : 
    7431                 : /*
    7432                 :  * Retreat *logSegNo to the last segment that we need to retain because of
    7433                 :  * either wal_keep_size or replication slots.
    7434                 :  *
    7435                 :  * This is calculated by subtracting wal_keep_size from the given xlog
    7436                 :  * location, recptr and by making sure that that result is below the
    7437                 :  * requirement of replication slots.  For the latter criterion we do consider
    7438 EUB             :  * the effects of max_slot_wal_keep_size: reserve at most that much space back
    7439                 :  * from recptr.
    7440                 :  *
    7441 ECB             :  * Note about replication slots: if this function calculates a value
    7442                 :  * that's further ahead than what slots need reserved, then affected
    7443                 :  * slots need to be invalidated and this function invoked again.
    7444                 :  * XXX it might be a good idea to rewrite this function so that
    7445                 :  * invalidation is optionally done here, instead.
    7446                 :  */
    7447                 : static void
    7448 GIC        2660 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
    7449                 : {
    7450                 :     XLogSegNo   currSegNo;
    7451                 :     XLogSegNo   segno;
    7452                 :     XLogRecPtr  keep;
    7453                 : 
    7454            2660 :     XLByteToSeg(recptr, currSegNo, wal_segment_size);
    7455 CBC        2660 :     segno = currSegNo;
    7456 EUB             : 
    7457                 :     /*
    7458 ECB             :      * Calculate how many segments are kept by slots first, adjusting for
    7459                 :      * max_slot_wal_keep_size.
    7460                 :      */
    7461 GIC        2660 :     keep = XLogGetReplicationSlotMinimumLSN();
    7462            2660 :     if (keep != InvalidXLogRecPtr)
    7463                 :     {
    7464 CBC         384 :         XLByteToSeg(keep, segno, wal_segment_size);
    7465                 : 
    7466                 :         /* Cap by max_slot_wal_keep_size ... */
    7467 GIC         384 :         if (max_slot_wal_keep_size_mb >= 0)
    7468                 :         {
    7469                 :             uint64      slot_keep_segs;
    7470                 : 
    7471              17 :             slot_keep_segs =
    7472              17 :                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
    7473 ECB             : 
    7474 CBC          17 :             if (currSegNo - segno > slot_keep_segs)
    7475 GIC           4 :                 segno = currSegNo - slot_keep_segs;
    7476                 :         }
    7477 ECB             :     }
    7478                 : 
    7479                 :     /* but, keep at least wal_keep_size if that's set */
    7480 CBC        2660 :     if (wal_keep_size_mb > 0)
    7481                 :     {
    7482 ECB             :         uint64      keep_segs;
    7483                 : 
    7484 GIC          59 :         keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
    7485              59 :         if (currSegNo - segno < keep_segs)
    7486                 :         {
    7487                 :             /* avoid underflow, don't go below 1 */
    7488              59 :             if (currSegNo <= keep_segs)
    7489              57 :                 segno = 1;
    7490                 :             else
    7491               2 :                 segno = currSegNo - keep_segs;
    7492 ECB             :         }
    7493 EUB             :     }
    7494                 : 
    7495                 :     /* don't delete WAL segments newer than the calculated segment */
    7496 GIC        2660 :     if (segno < *logSegNo)
    7497             109 :         *logSegNo = segno;
    7498 CBC        2660 : }
    7499                 : 
    7500                 : /*
    7501                 :  * Write a NEXTOID log record
    7502                 :  */
    7503                 : void
    7504 GIC        1248 : XLogPutNextOid(Oid nextOid)
    7505                 : {
    7506            1248 :     XLogBeginInsert();
    7507            1248 :     XLogRegisterData((char *) (&nextOid), sizeof(Oid));
    7508            1248 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
    7509                 : 
    7510                 :     /*
    7511                 :      * We need not flush the NEXTOID record immediately, because any of the
    7512                 :      * just-allocated OIDs could only reach disk as part of a tuple insert or
    7513                 :      * update that would have its own XLOG record that must follow the NEXTOID
    7514                 :      * record.  Therefore, the standard buffer LSN interlock applied to those
    7515                 :      * records will ensure no such OID reaches disk before the NEXTOID record
    7516                 :      * does.
    7517                 :      *
    7518                 :      * Note, however, that the above statement only covers state "within" the
    7519                 :      * database.  When we use a generated OID as a file or directory name, we
    7520                 :      * are in a sense violating the basic WAL rule, because that filesystem
    7521                 :      * change may reach disk before the NEXTOID WAL record does.  The impact
    7522                 :      * of this is that if a database crash occurs immediately afterward, we
    7523                 :      * might after restart re-generate the same OID and find that it conflicts
    7524                 :      * with the leftover file or directory.  But since for safety's sake we
    7525 ECB             :      * always loop until finding a nonconflicting filename, this poses no real
    7526                 :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
    7527                 :      */
    7528 GIC        1248 : }
    7529                 : 
    7530                 : /*
    7531                 :  * Write an XLOG SWITCH record.
    7532                 :  *
    7533                 :  * Here we just blindly issue an XLogInsert request for the record.
    7534                 :  * All the magic happens inside XLogInsert.
    7535                 :  *
    7536                 :  * The return value is either the end+1 address of the switch record,
    7537                 :  * or the end+1 address of the prior segment if we did not need to
    7538 ECB             :  * write a switch record because we are already at segment start.
    7539                 :  */
    7540                 : XLogRecPtr
    7541 GIC         300 : RequestXLogSwitch(bool mark_unimportant)
    7542                 : {
    7543                 :     XLogRecPtr  RecPtr;
    7544                 : 
    7545                 :     /* XLOG SWITCH has no data */
    7546 CBC         300 :     XLogBeginInsert();
    7547 ECB             : 
    7548 CBC         300 :     if (mark_unimportant)
    7549 UIC           0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
    7550 GIC         300 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
    7551                 : 
    7552             300 :     return RecPtr;
    7553                 : }
    7554                 : 
    7555 ECB             : /*
    7556                 :  * Write a RESTORE POINT record
    7557                 :  */
    7558                 : XLogRecPtr
    7559 CBC           3 : XLogRestorePoint(const char *rpName)
    7560                 : {
    7561 ECB             :     XLogRecPtr  RecPtr;
    7562                 :     xl_restore_point xlrec;
    7563                 : 
    7564 CBC           3 :     xlrec.rp_time = GetCurrentTimestamp();
    7565 GIC           3 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    7566                 : 
    7567 CBC           3 :     XLogBeginInsert();
    7568 GIC           3 :     XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
    7569                 : 
    7570               3 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
    7571                 : 
    7572               3 :     ereport(LOG,
    7573 ECB             :             (errmsg("restore point \"%s\" created at %X/%X",
    7574                 :                     rpName, LSN_FORMAT_ARGS(RecPtr))));
    7575                 : 
    7576 CBC           3 :     return RecPtr;
    7577 ECB             : }
    7578                 : 
    7579                 : /*
    7580                 :  * Check if any of the GUC parameters that are critical for hot standby
    7581                 :  * have changed, and update the value in pg_control file if necessary.
    7582                 :  */
    7583                 : static void
    7584 CBC        1142 : XLogReportParameters(void)
    7585 ECB             : {
    7586 GIC        1142 :     if (wal_level != ControlFile->wal_level ||
    7587             982 :         wal_log_hints != ControlFile->wal_log_hints ||
    7588 GBC         923 :         MaxConnections != ControlFile->MaxConnections ||
    7589 GIC         922 :         max_worker_processes != ControlFile->max_worker_processes ||
    7590             922 :         max_wal_senders != ControlFile->max_wal_senders ||
    7591             918 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
    7592             838 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
    7593             838 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
    7594                 :     {
    7595                 :         /*
    7596                 :          * The change in number of backend slots doesn't need to be WAL-logged
    7597                 :          * if archiving is not enabled, as you can't start archive recovery
    7598                 :          * with wal_level=minimal anyway. We don't really care about the
    7599                 :          * values in pg_control either if wal_level=minimal, but seems better
    7600                 :          * to keep them up-to-date to avoid confusion.
    7601                 :          */
    7602             311 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    7603                 :         {
    7604                 :             xl_parameter_change xlrec;
    7605                 :             XLogRecPtr  recptr;
    7606                 : 
    7607             309 :             xlrec.MaxConnections = MaxConnections;
    7608             309 :             xlrec.max_worker_processes = max_worker_processes;
    7609 CBC         309 :             xlrec.max_wal_senders = max_wal_senders;
    7610 GIC         309 :             xlrec.max_prepared_xacts = max_prepared_xacts;
    7611             309 :             xlrec.max_locks_per_xact = max_locks_per_xact;
    7612             309 :             xlrec.wal_level = wal_level;
    7613             309 :             xlrec.wal_log_hints = wal_log_hints;
    7614             309 :             xlrec.track_commit_timestamp = track_commit_timestamp;
    7615 ECB             : 
    7616 CBC         309 :             XLogBeginInsert();
    7617 GIC         309 :             XLogRegisterData((char *) &xlrec, sizeof(xlrec));
    7618                 : 
    7619             309 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
    7620             309 :             XLogFlush(recptr);
    7621                 :         }
    7622 ECB             : 
    7623 CBC         311 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7624                 : 
    7625             311 :         ControlFile->MaxConnections = MaxConnections;
    7626 GIC         311 :         ControlFile->max_worker_processes = max_worker_processes;
    7627             311 :         ControlFile->max_wal_senders = max_wal_senders;
    7628 CBC         311 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
    7629 GIC         311 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
    7630             311 :         ControlFile->wal_level = wal_level;
    7631             311 :         ControlFile->wal_log_hints = wal_log_hints;
    7632 CBC         311 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
    7633             311 :         UpdateControlFile();
    7634                 : 
    7635             311 :         LWLockRelease(ControlFileLock);
    7636 ECB             :     }
    7637 GIC        1142 : }
    7638                 : 
    7639                 : /*
    7640                 :  * Update full_page_writes in shared memory, and write an
    7641 ECB             :  * XLOG_FPW_CHANGE record if necessary.
    7642                 :  *
    7643                 :  * Note: this function assumes there is no other process running
    7644                 :  * concurrently that could update it.
    7645                 :  */
    7646                 : void
    7647 GIC        1533 : UpdateFullPageWrites(void)
    7648                 : {
    7649 CBC        1533 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    7650 ECB             :     bool        recoveryInProgress;
    7651                 : 
    7652                 :     /*
    7653                 :      * Do nothing if full_page_writes has not been changed.
    7654                 :      *
    7655                 :      * It's safe to check the shared full_page_writes without the lock,
    7656                 :      * because we assume that there is no concurrently running process which
    7657                 :      * can update it.
    7658                 :      */
    7659 CBC        1533 :     if (fullPageWrites == Insert->fullPageWrites)
    7660 GIC        1264 :         return;
    7661                 : 
    7662                 :     /*
    7663                 :      * Perform this outside critical section so that the WAL insert
    7664                 :      * initialization done by RecoveryInProgress() doesn't trigger an
    7665 ECB             :      * assertion failure.
    7666                 :      */
    7667 CBC         269 :     recoveryInProgress = RecoveryInProgress();
    7668 ECB             : 
    7669 CBC         269 :     START_CRIT_SECTION();
    7670                 : 
    7671                 :     /*
    7672                 :      * It's always safe to take full page images, even when not strictly
    7673                 :      * required, but not the other way round. So if we're setting full_page_writes
    7674                 :      * to true, first set it true and then write the WAL record. If we're
    7675                 :      * setting it to false, first write the WAL record and then set the global
    7676                 :      * flag.
    7677                 :      */
    7678 GIC         269 :     if (fullPageWrites)
    7679                 :     {
    7680             267 :         WALInsertLockAcquireExclusive();
    7681             267 :         Insert->fullPageWrites = true;
    7682             267 :         WALInsertLockRelease();
    7683                 :     }
    7684                 : 
    7685                 :     /*
    7686                 :      * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
    7687                 :      * full_page_writes during archive recovery, if required.
    7688                 :      */
    7689 CBC         269 :     if (XLogStandbyInfoActive() && !recoveryInProgress)
    7690                 :     {
    7691 UIC           0 :         XLogBeginInsert();
    7692               0 :         XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
    7693                 : 
    7694               0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
    7695                 :     }
    7696                 : 
    7697 GIC         269 :     if (!fullPageWrites)
    7698                 :     {
    7699               2 :         WALInsertLockAcquireExclusive();
    7700               2 :         Insert->fullPageWrites = false;
    7701               2 :         WALInsertLockRelease();
    7702 ECB             :     }
    7703 GIC         269 :     END_CRIT_SECTION();
    7704                 : }
    7705                 : 
    7706                 : /*
    7707 ECB             :  * XLOG resource manager's routines
    7708                 :  *
    7709                 :  * Definitions of info values are in include/catalog/pg_control.h, though
    7710 EUB             :  * not all record types are related to control file updates.
    7711 ECB             :  *
    7712                 :  * NOTE: Some XLOG record types that are directly related to WAL recovery
    7713                 :  * are handled in xlogrecovery_redo().
    7714                 :  */
    7715                 : void
    7716 GIC       28161 : xlog_redo(XLogReaderState *record)
    7717                 : {
    7718           28161 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    7719           28161 :     XLogRecPtr  lsn = record->EndRecPtr;
    7720 ECB             : 
    7721                 :     /*
    7722                 :      * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
    7723                 :      * XLOG_FPI_FOR_HINT records.
    7724                 :      */
    7725 CBC       28161 :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
    7726 ECB             :            !XLogRecHasAnyBlockRefs(record));
    7727                 : 
    7728 CBC       28161 :     if (info == XLOG_NEXTOID)
    7729 ECB             :     {
    7730                 :         Oid         nextOid;
    7731                 : 
    7732                 :         /*
    7733                 :          * We used to try to take the maximum of ShmemVariableCache->nextOid
    7734                 :          * and the recorded nextOid, but that fails if the OID counter wraps
    7735                 :          * around.  Since no OID allocation should be happening during replay
    7736                 :          * anyway, better to just believe the record exactly.  We still take
    7737                 :          * OidGenLock while setting the variable, just in case.
    7738                 :          */
    7739 GIC          72 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    7740              72 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    7741              72 :         ShmemVariableCache->nextOid = nextOid;
    7742              72 :         ShmemVariableCache->oidCount = 0;
    7743              72 :         LWLockRelease(OidGenLock);
    7744                 :     }
    7745 CBC       28089 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    7746                 :     {
    7747 ECB             :         CheckPoint  checkPoint;
    7748                 :         TimeLineID  replayTLI;
    7749                 : 
    7750 CBC          25 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    7751 ECB             :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
    7752 CBC          25 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    7753              25 :         ShmemVariableCache->nextXid = checkPoint.nextXid;
    7754              25 :         LWLockRelease(XidGenLock);
    7755 GIC          25 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    7756              25 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
    7757              25 :         ShmemVariableCache->oidCount = 0;
    7758              25 :         LWLockRelease(OidGenLock);
    7759              25 :         MultiXactSetNextMXact(checkPoint.nextMulti,
    7760                 :                               checkPoint.nextMultiOffset);
    7761                 : 
    7762              25 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    7763 ECB             :                                checkPoint.oldestMultiDB);
    7764                 : 
    7765                 :         /*
    7766                 :          * No need to set oldestClogXid here as well; it'll be set when we
    7767                 :          * redo an xl_clog_truncate if it changed since initialization.
    7768                 :          */
    7769 CBC          25 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    7770 ECB             : 
    7771                 :         /*
    7772                 :          * If we see a shutdown checkpoint while waiting for an end-of-backup
    7773                 :          * record, the backup was canceled and the end-of-backup record will
    7774                 :          * never arrive.
    7775                 :          */
    7776 GIC          25 :         if (ArchiveRecoveryRequested &&
    7777 CBC          25 :             !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
    7778 LBC           0 :             XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
    7779 UIC           0 :             ereport(PANIC,
    7780 ECB             :                     (errmsg("online backup was canceled, recovery cannot continue")));
    7781                 : 
    7782                 :         /*
    7783                 :          * If we see a shutdown checkpoint, we know that nothing was running
    7784                 :          * on the primary at this point. So fake-up an empty running-xacts
    7785                 :          * record and use that here and now. Recover additional standby state
    7786                 :          * for prepared transactions.
    7787                 :          */
    7788 CBC          25 :         if (standbyState >= STANDBY_INITIALIZED)
    7789 ECB             :         {
    7790                 :             TransactionId *xids;
    7791                 :             int         nxids;
    7792                 :             TransactionId oldestActiveXID;
    7793                 :             TransactionId latestCompletedXid;
    7794                 :             RunningTransactionsData running;
    7795                 : 
    7796 CBC          23 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
    7797                 : 
    7798 ECB             :             /*
    7799                 :              * Construct a RunningTransactions snapshot representing a shut
    7800                 :              * down server, with only prepared transactions still alive. We're
    7801                 :              * never overflowed at this point because all subxids are listed
    7802                 :              * with their parent prepared transactions.
    7803                 :              */
    7804 GIC          23 :             running.xcnt = nxids;
    7805              23 :             running.subxcnt = 0;
    7806              23 :             running.subxid_overflow = false;
    7807              23 :             running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
    7808 CBC          23 :             running.oldestRunningXid = oldestActiveXID;
    7809 GIC          23 :             latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
    7810 CBC          23 :             TransactionIdRetreat(latestCompletedXid);
    7811 GIC          23 :             Assert(TransactionIdIsNormal(latestCompletedXid));
    7812              23 :             running.latestCompletedXid = latestCompletedXid;
    7813              23 :             running.xids = xids;
    7814                 : 
    7815              23 :             ProcArrayApplyRecoveryInfo(&running);
    7816                 : 
    7817              23 :             StandbyRecoverPreparedTransactions();
    7818                 :         }
    7819                 : 
    7820 ECB             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    7821 CBC          25 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7822 GIC          25 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    7823              25 :         LWLockRelease(ControlFileLock);
    7824                 : 
    7825                 :         /* Update shared-memory copy of checkpoint XID/epoch */
    7826              25 :         SpinLockAcquire(&XLogCtl->info_lck);
    7827              25 :         XLogCtl->ckptFullXid = checkPoint.nextXid;
    7828 CBC          25 :         SpinLockRelease(&XLogCtl->info_lck);
    7829                 : 
    7830 ECB             :         /*
    7831                 :          * We should've already switched to the new TLI before replaying this
    7832                 :          * record.
    7833                 :          */
    7834 GIC          25 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    7835              25 :         if (checkPoint.ThisTimeLineID != replayTLI)
    7836 UIC           0 :             ereport(PANIC,
    7837                 :                     (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
    7838                 :                             checkPoint.ThisTimeLineID, replayTLI)));
    7839 ECB             : 
    7840 GIC          25 :         RecoveryRestartPoint(&checkPoint, record);
    7841 ECB             :     }
    7842 CBC       28064 :     else if (info == XLOG_CHECKPOINT_ONLINE)
    7843 ECB             :     {
    7844                 :         CheckPoint  checkPoint;
    7845                 :         TimeLineID  replayTLI;
    7846                 : 
    7847 GIC         141 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    7848                 :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    7849             141 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    7850 CBC         141 :         if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
    7851                 :                                       checkPoint.nextXid))
    7852 UBC           0 :             ShmemVariableCache->nextXid = checkPoint.nextXid;
    7853 GBC         141 :         LWLockRelease(XidGenLock);
    7854                 : 
    7855 EUB             :         /*
    7856                 :          * We ignore the nextOid counter in an ONLINE checkpoint, preferring
    7857                 :          * to track OID assignment through XLOG_NEXTOID records.  The nextOid
    7858 ECB             :          * counter is from the start of the checkpoint and might well be stale
    7859                 :          * compared to later XLOG_NEXTOID records.  We could try to take the
    7860                 :          * maximum of the nextOid counter and our latest value, but since
    7861                 :          * there's no particular guarantee about the speed with which the OID
    7862                 :          * counter wraps around, that's a risky thing to do.  In any case,
    7863                 :          * users of the nextOid counter are required to avoid assignment of
    7864                 :          * duplicates, so that a somewhat out-of-date value should be safe.
    7865                 :          */
    7866                 : 
    7867                 :         /* Handle multixact */
    7868 GIC         141 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
    7869                 :                                   checkPoint.nextMultiOffset);
    7870                 : 
    7871                 :         /*
    7872                 :          * NB: This may perform multixact truncation when replaying WAL
    7873                 :          * generated by an older primary.
    7874                 :          */
    7875             141 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
    7876                 :                                checkPoint.oldestMultiDB);
    7877 CBC         141 :         if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
    7878                 :                                   checkPoint.oldestXid))
    7879 LBC           0 :             SetTransactionIdLimit(checkPoint.oldestXid,
    7880 ECB             :                                   checkPoint.oldestXidDB);
    7881                 :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    7882 GIC         141 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    7883             141 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    7884             141 :         LWLockRelease(ControlFileLock);
    7885                 : 
    7886 ECB             :         /* Update shared-memory copy of checkpoint XID/epoch */
    7887 GIC         141 :         SpinLockAcquire(&XLogCtl->info_lck);
    7888             141 :         XLogCtl->ckptFullXid = checkPoint.nextXid;
    7889 CBC         141 :         SpinLockRelease(&XLogCtl->info_lck);
    7890                 : 
    7891                 :         /* TLI should not change in an on-line checkpoint */
    7892 GIC         141 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    7893             141 :         if (checkPoint.ThisTimeLineID != replayTLI)
    7894 UIC           0 :             ereport(PANIC,
    7895                 :                     (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
    7896                 :                             checkPoint.ThisTimeLineID, replayTLI)));
    7897                 : 
    7898 GIC         141 :         RecoveryRestartPoint(&checkPoint, record);
    7899                 :     }
    7900 CBC       27923 :     else if (info == XLOG_OVERWRITE_CONTRECORD)
    7901 ECB             :     {
    7902                 :         /* nothing to do here, handled in xlogrecovery_redo() */
    7903                 :     }
    7904 CBC       27922 :     else if (info == XLOG_END_OF_RECOVERY)
    7905                 :     {
    7906 ECB             :         xl_end_of_recovery xlrec;
    7907                 :         TimeLineID  replayTLI;
    7908                 : 
    7909 GIC           8 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
    7910                 : 
    7911 ECB             :         /*
    7912                 :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
    7913                 :          * but this case is rarer and harder to test, so the benefit doesn't
    7914                 :          * outweigh the potential extra cost of maintenance.
    7915                 :          */
    7916                 : 
    7917                 :         /*
    7918                 :          * We should've already switched to the new TLI before replaying this
    7919                 :          * record.
    7920                 :          */
    7921 GIC           8 :         (void) GetCurrentReplayRecPtr(&replayTLI);
    7922               8 :         if (xlrec.ThisTimeLineID != replayTLI)
    7923 LBC           0 :             ereport(PANIC,
    7924                 :                     (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
    7925                 :                             xlrec.ThisTimeLineID, replayTLI)));
    7926                 :     }
    7927 GIC       27914 :     else if (info == XLOG_NOOP)
    7928                 :     {
    7929                 :         /* nothing to do here */
    7930 ECB             :     }
    7931 GIC       27914 :     else if (info == XLOG_SWITCH)
    7932                 :     {
    7933                 :         /* nothing to do here */
    7934                 :     }
    7935           27820 :     else if (info == XLOG_RESTORE_POINT)
    7936                 :     {
    7937 ECB             :         /* nothing to do here, handled in xlogrecovery.c */
    7938                 :     }
    7939 GBC       27815 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    7940 EUB             :     {
    7941                 :         /*
    7942                 :          * XLOG_FPI records contain nothing else but one or more block
    7943                 :          * references. Every block reference must include a full-page image
    7944                 :          * even if full_page_writes was disabled when the record was generated
    7945                 :          * - otherwise there would be no point in this record.
    7946                 :          *
    7947                 :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
    7948                 :          * WAL-logged because of a hint bit update. They are only generated
    7949 ECB             :          * when checksums and/or wal_log_hints are enabled. They may include
    7950                 :          * no full-page images if full_page_writes was disabled when they were
    7951                 :          * generated. In this case there is nothing to do here.
    7952                 :          *
    7953                 :          * No recovery conflicts are generated by these generic records - if a
    7954                 :          * resource manager needs to generate conflicts, it has to define a
    7955                 :          * separate WAL record type and redo routine.
    7956                 :          */
    7957 CBC       57791 :         for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    7958                 :         {
    7959                 :             Buffer      buffer;
    7960                 : 
    7961 GIC       30056 :             if (!XLogRecHasBlockImage(record, block_id))
    7962                 :             {
    7963              81 :                 if (info == XLOG_FPI)
    7964 UIC           0 :                     elog(ERROR, "XLOG_FPI record did not contain a full-page image");
    7965 CBC          81 :                 continue;
    7966 ECB             :             }
    7967                 : 
    7968 CBC       29975 :             if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
    7969 LBC           0 :                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
    7970 CBC       29975 :             UnlockReleaseBuffer(buffer);
    7971 ECB             :         }
    7972                 :     }
    7973 CBC          80 :     else if (info == XLOG_BACKUP_END)
    7974 ECB             :     {
    7975                 :         /* nothing to do here, handled in xlogrecovery_redo() */
    7976                 :     }
    7977 GIC          19 :     else if (info == XLOG_PARAMETER_CHANGE)
    7978 ECB             :     {
    7979                 :         xl_parameter_change xlrec;
    7980                 : 
    7981                 :         /* Update our copy of the parameters in pg_control */
    7982 CBC          19 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
    7983 ECB             : 
    7984                 :         /*
    7985                 :          * Invalidate logical slots if we are in hot standby and the primary
    7986                 :          * does not have a WAL level sufficient for logical decoding. No need
    7987                 :          * to search for potentially conflicting logical slots if standby is
    7988                 :          * running with wal_level lower than logical, because in that case, we
    7989                 :          * would have either disallowed creation of logical slots or
    7990                 :          * invalidated existing ones.
    7991                 :          */
    7992 GNC          19 :         if (InRecovery && InHotStandby &&
    7993               4 :             xlrec.wal_level < WAL_LEVEL_LOGICAL &&
    7994               3 :             wal_level >= WAL_LEVEL_LOGICAL)
    7995               1 :             InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
    7996                 :                                                0, InvalidOid,
    7997                 :                                                InvalidTransactionId);
    7998                 : 
    7999 CBC          19 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8000 GIC          19 :         ControlFile->MaxConnections = xlrec.MaxConnections;
    8001              19 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
    8002 CBC          19 :         ControlFile->max_wal_senders = xlrec.max_wal_senders;
    8003              19 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
    8004              19 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
    8005 GIC          19 :         ControlFile->wal_level = xlrec.wal_level;
    8006              19 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
    8007                 : 
    8008                 :         /*
    8009                 :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
    8010 ECB             :          * recover back up to this point before allowing hot standby again.
    8011                 :          * This is important if the max_* settings are decreased, to ensure
    8012 EUB             :          * you don't run queries against the WAL preceding the change. The
    8013                 :          * local copies cannot be updated as long as crash recovery is
    8014                 :          * happening and we expect all the WAL to be replayed.
    8015                 :          */
    8016 CBC          19 :         if (InArchiveRecovery)
    8017                 :         {
    8018               5 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
    8019 GIC           5 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    8020                 :         }
    8021              19 :         if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
    8022                 :         {
    8023 ECB             :             TimeLineID  replayTLI;
    8024                 : 
    8025 CBC           5 :             (void) GetCurrentReplayRecPtr(&replayTLI);
    8026               5 :             ControlFile->minRecoveryPoint = lsn;
    8027 GIC           5 :             ControlFile->minRecoveryPointTLI = replayTLI;
    8028 EUB             :         }
    8029 ECB             : 
    8030 GIC          19 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
    8031              19 :                                 ControlFile->track_commit_timestamp);
    8032              19 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
    8033                 : 
    8034              19 :         UpdateControlFile();
    8035              19 :         LWLockRelease(ControlFileLock);
    8036                 : 
    8037                 :         /* Check to see if any parameter change causes a problem during recovery */
    8038              19 :         CheckRequiredParameterValues();
    8039                 :     }
    8040 UIC           0 :     else if (info == XLOG_FPW_CHANGE)
    8041                 :     {
    8042                 :         bool        fpw;
    8043                 : 
    8044 LBC           0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
    8045                 : 
    8046                 :         /*
    8047                 :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
    8048                 :          * do_pg_backup_start() and do_pg_backup_stop() can check whether
    8049                 :          * full_page_writes has been disabled during online backup.
    8050                 :          */
    8051               0 :         if (!fpw)
    8052                 :         {
    8053               0 :             SpinLockAcquire(&XLogCtl->info_lck);
    8054 UIC           0 :             if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
    8055 UBC           0 :                 XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
    8056 UIC           0 :             SpinLockRelease(&XLogCtl->info_lck);
    8057                 :         }
    8058 ECB             : 
    8059                 :         /* Keep track of full_page_writes */
    8060 LBC           0 :         lastFullPageWrites = fpw;
    8061                 :     }
    8062 GIC       28159 : }
    8063 ECB             : 
    8064                 : /*
    8065                 :  * Return the extra open flags used for opening a file, depending on the
    8066                 :  * value of the GUCs wal_sync_method, fsync and io_direct.
    8067                 :  */
    8068                 : static int
    8069 CBC        8348 : get_sync_bit(int method)
    8070 EUB             : {
    8071 GIC        8348 :     int         o_direct_flag = 0;
    8072                 : 
    8073                 :     /*
    8074                 :      * Use O_DIRECT if requested, except in walreceiver process.  The WAL
    8075                 :      * written by walreceiver is normally read by the startup process soon
    8076                 :      * after it's written.  Also, walreceiver performs unaligned writes, which
    8077                 :      * don't work with O_DIRECT, so it is required for correctness too.
    8078                 :      */
    8079 GNC        8348 :     if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
    8080 GIC           7 :         o_direct_flag = PG_O_DIRECT;
    8081                 : 
    8082                 :     /* If fsync is disabled, never open in sync mode */
    8083 GNC        8348 :     if (!enableFsync)
    8084            8348 :         return o_direct_flag;
    8085                 : 
    8086 UIC           0 :     switch (method)
    8087                 :     {
    8088                 :             /*
    8089 ECB             :              * enum values for all sync options are defined even if they are
    8090                 :              * not supported on the current platform.  But if not, they are
    8091 EUB             :              * not included in the enum option array, and therefore will never
    8092                 :              * be seen here.
    8093                 :              */
    8094 UIC           0 :         case SYNC_METHOD_FSYNC:
    8095 ECB             :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
    8096                 :         case SYNC_METHOD_FDATASYNC:
    8097 UNC           0 :             return o_direct_flag;
    8098                 : #ifdef O_SYNC
    8099 LBC           0 :         case SYNC_METHOD_OPEN:
    8100 UNC           0 :             return O_SYNC | o_direct_flag;
    8101                 : #endif
    8102                 : #ifdef O_DSYNC
    8103 LBC           0 :         case SYNC_METHOD_OPEN_DSYNC:
    8104 UNC           0 :             return O_DSYNC | o_direct_flag;
    8105                 : #endif
    8106 UIC           0 :         default:
    8107 ECB             :             /* can't happen (unless we are out of sync with option array) */
    8108 UIC           0 :             elog(ERROR, "unrecognized wal_sync_method: %d", method);
    8109                 :             return 0;           /* silence warning */
    8110                 :     }
    8111                 : }
    8112                 : 
    8113                 : /*
    8114                 :  * GUC support
    8115                 :  */
    8116                 : void
    8117 GIC        1857 : assign_xlog_sync_method(int new_sync_method, void *extra)
    8118                 : {
    8119            1857 :     if (sync_method != new_sync_method)
    8120                 :     {
    8121                 :         /*
    8122                 :          * To ensure that no blocks escape unsynced, force an fsync on the
    8123                 :          * currently open log segment (if any).  Also, if the open flag is
    8124                 :          * changing, close the log file so it will be reopened (with new flag
    8125 ECB             :          * bit) at next use.
    8126                 :          */
    8127 UIC           0 :         if (openLogFile >= 0)
    8128                 :         {
    8129 LBC           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
    8130 UIC           0 :             if (pg_fsync(openLogFile) != 0)
    8131 ECB             :             {
    8132 EUB             :                 char        xlogfname[MAXFNAMELEN];
    8133 ECB             :                 int         save_errno;
    8134                 : 
    8135 UIC           0 :                 save_errno = errno;
    8136 LBC           0 :                 XLogFileName(xlogfname, openLogTLI, openLogSegNo,
    8137 EUB             :                              wal_segment_size);
    8138 LBC           0 :                 errno = save_errno;
    8139 UIC           0 :                 ereport(PANIC,
    8140                 :                         (errcode_for_file_access(),
    8141 ECB             :                          errmsg("could not fsync file \"%s\": %m", xlogfname)));
    8142                 :             }
    8143                 : 
    8144 UIC           0 :             pgstat_report_wait_end();
    8145 LBC           0 :             if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
    8146 UIC           0 :                 XLogFileClose();
    8147                 :         }
    8148                 :     }
    8149 GIC        1857 : }
    8150 ECB             : 
    8151                 : 
    8152                 : /*
    8153                 :  * Issue appropriate kind of fsync (if any) for an XLOG output file.
    8154                 :  *
    8155                 :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
    8156                 :  * 'segno' is for error reporting purposes.
    8157                 :  */
    8158                 : void
    8159 GIC      315883 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
    8160 ECB             : {
    8161 CBC      315883 :     char       *msg = NULL;
    8162 ECB             :     instr_time  start;
    8163                 : 
    8164 GIC      315883 :     Assert(tli != 0);
    8165                 : 
    8166                 :     /*
    8167 ECB             :      * Quick exit if fsync is disabled or write() has already synced the WAL
    8168                 :      * file.
    8169                 :      */
    8170 CBC      315883 :     if (!enableFsync ||
    8171 LBC           0 :         sync_method == SYNC_METHOD_OPEN ||
    8172               0 :         sync_method == SYNC_METHOD_OPEN_DSYNC)
    8173 CBC      315883 :         return;
    8174 ECB             : 
    8175                 :     /* Measure I/O timing to sync the WAL file */
    8176 UIC           0 :     if (track_wal_io_timing)
    8177               0 :         INSTR_TIME_SET_CURRENT(start);
    8178                 :     else
    8179 UNC           0 :         INSTR_TIME_SET_ZERO(start);
    8180                 : 
    8181 UIC           0 :     pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
    8182               0 :     switch (sync_method)
    8183                 :     {
    8184               0 :         case SYNC_METHOD_FSYNC:
    8185               0 :             if (pg_fsync_no_writethrough(fd) != 0)
    8186 LBC           0 :                 msg = _("could not fsync file \"%s\": %m");
    8187 UIC           0 :             break;
    8188 ECB             : #ifdef HAVE_FSYNC_WRITETHROUGH
    8189                 :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
    8190                 :             if (pg_fsync_writethrough(fd) != 0)
    8191                 :                 msg = _("could not fsync write-through file \"%s\": %m");
    8192                 :             break;
    8193                 : #endif
    8194 LBC           0 :         case SYNC_METHOD_FDATASYNC:
    8195               0 :             if (pg_fdatasync(fd) != 0)
    8196               0 :                 msg = _("could not fdatasync file \"%s\": %m");
    8197 UIC           0 :             break;
    8198 LBC           0 :         case SYNC_METHOD_OPEN:
    8199 ECB             :         case SYNC_METHOD_OPEN_DSYNC:
    8200                 :             /* not reachable */
    8201 UIC           0 :             Assert(false);
    8202 ECB             :             break;
    8203 LBC           0 :         default:
    8204 UIC           0 :             elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
    8205                 :             break;
    8206 ECB             :     }
    8207                 : 
    8208 EUB             :     /* PANIC if failed to fsync */
    8209 UIC           0 :     if (msg)
    8210                 :     {
    8211                 :         char        xlogfname[MAXFNAMELEN];
    8212 UBC           0 :         int         save_errno = errno;
    8213                 : 
    8214 UIC           0 :         XLogFileName(xlogfname, tli, segno, wal_segment_size);
    8215               0 :         errno = save_errno;
    8216               0 :         ereport(PANIC,
    8217                 :                 (errcode_for_file_access(),
    8218                 :                  errmsg(msg, xlogfname)));
    8219 EUB             :     }
    8220                 : 
    8221 UBC           0 :     pgstat_report_wait_end();
    8222 EUB             : 
    8223                 :     /*
    8224                 :      * Increment the I/O timing and the number of times WAL files were synced.
    8225                 :      */
    8226 UIC           0 :     if (track_wal_io_timing)
    8227                 :     {
    8228 EUB             :         instr_time  duration;
    8229                 : 
    8230 LBC           0 :         INSTR_TIME_SET_CURRENT(duration);
    8231 UNC           0 :         INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, duration, start);
    8232                 :     }
    8233                 : 
    8234 UIC           0 :     PendingWalStats.wal_sync++;
    8235                 : }
    8236 ECB             : 
    8237                 : /*
    8238                 :  * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
    8239                 :  * function. It creates the necessary starting checkpoint and constructs the
    8240                 :  * backup state and tablespace map.
    8241                 :  *
    8242                 :  * Input parameters are "state" (the backup state), "fast" (if true, we do
    8243                 :  * the checkpoint in immediate mode to make it faster), and "tablespaces"
    8244                 :  * (if non-NULL, indicates a list of tablespaceinfo structs describing the
    8245                 :  * cluster's tablespaces.).
    8246                 :  *
    8247                 :  * The tablespace map contents are appended to the passed-in parameter
    8248                 :  * tblspcmapfile and the caller is responsible for including it in the backup
    8249                 :  * archive as 'tablespace_map'. The tablespace_map file is required mainly for
    8250                 :  * tar format in Windows as native Windows utilities are not able to create
    8251                 :  * symlinks while extracting files from tar. However for consistency and
    8252                 :  * platform-independence, we do it the same way everywhere.
    8253                 :  *
    8254                 :  * It fills in "state" with the information required for the backup, such
    8255                 :  * as the minimum WAL location that must be present to restore from this
    8256                 :  * backup (startpoint) and the corresponding timeline ID (starttli).
    8257                 :  *
    8258                 :  * Every successfully started backup must be stopped by calling
    8259                 :  * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
    8260 EUB             :  * backups active at the same time.
    8261                 :  *
    8262                 :  * It is the responsibility of the caller of this function to verify the
    8263                 :  * permissions of the calling user!
    8264                 :  */
    8265                 : void
    8266 GNC         130 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
    8267                 :                    BackupState *state, StringInfo tblspcmapfile)
    8268 EUB             : {
    8269                 :     bool        backup_started_in_recovery;
    8270                 : 
    8271 GNC         130 :     Assert(state != NULL);
    8272 GIC         130 :     backup_started_in_recovery = RecoveryInProgress();
    8273                 : 
    8274                 :     /*
    8275                 :      * During recovery, we don't need to check WAL level. Because, if WAL
    8276 ECB             :      * level is not sufficient, it's impossible to get here during recovery.
    8277                 :      */
    8278 CBC         130 :     if (!backup_started_in_recovery && !XLogIsNeeded())
    8279 UIC           0 :         ereport(ERROR,
    8280                 :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    8281                 :                  errmsg("WAL level not sufficient for making an online backup"),
    8282                 :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
    8283                 : 
    8284 GIC         130 :     if (strlen(backupidstr) > MAXPGPATH)
    8285               1 :         ereport(ERROR,
    8286 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    8287                 :                  errmsg("backup label too long (max %d bytes)",
    8288                 :                         MAXPGPATH)));
    8289                 : 
    8290 GNC         129 :     memcpy(state->name, backupidstr, strlen(backupidstr));
    8291                 : 
    8292                 :     /*
    8293                 :      * Mark backup active in shared memory.  We must do full-page WAL writes
    8294                 :      * during an on-line backup even if not doing so at other times, because
    8295                 :      * it's quite possible for the backup dump to obtain a "torn" (partially
    8296 EUB             :      * written) copy of a database page if it reads the page concurrently with
    8297                 :      * our write to the same page.  This can be fixed as long as the first
    8298                 :      * write to the page in the WAL sequence is a full-page write. Hence, we
    8299                 :      * increment runningBackups then force a CHECKPOINT, to ensure there are
    8300                 :      * no dirty pages in shared memory that might get dumped while the backup
    8301                 :      * is in progress without having a corresponding WAL record.  (Once the
    8302                 :      * backup is complete, we need not force full-page writes anymore, since
    8303                 :      * we expect that any pages not modified during the backup interval must
    8304                 :      * have been correctly captured by the backup.)
    8305                 :      *
    8306                 :      * Note that forcing full-page writes has no effect during an online
    8307                 :      * backup from the standby.
    8308                 :      *
    8309                 :      * We must hold all the insertion locks to change the value of
    8310                 :      * runningBackups, to ensure adequate interlocking against
    8311                 :      * XLogInsertRecord().
    8312                 :      */
    8313 GIC         129 :     WALInsertLockAcquireExclusive();
    8314             129 :     XLogCtl->Insert.runningBackups++;
    8315             129 :     WALInsertLockRelease();
    8316                 : 
    8317                 :     /*
    8318                 :      * Ensure we decrement runningBackups if we fail below. NB -- for this to
    8319                 :      * work correctly, it is critical that sessionBackupState is only updated
    8320                 :      * after this block is over.
    8321                 :      */
    8322 GNC         129 :     PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
    8323 ECB             :     {
    8324 GIC         129 :         bool        gotUniqueStartpoint = false;
    8325 ECB             :         DIR        *tblspcdir;
    8326                 :         struct dirent *de;
    8327                 :         tablespaceinfo *ti;
    8328                 :         int         datadirpathlen;
    8329                 : 
    8330                 :         /*
    8331                 :          * Force an XLOG file switch before the checkpoint, to ensure that the
    8332                 :          * WAL segment the checkpoint is written to doesn't contain pages with
    8333                 :          * old timeline IDs.  That would otherwise happen if you called
    8334                 :          * pg_backup_start() right after restoring from a PITR archive: the
    8335 EUB             :          * first WAL segment containing the startup checkpoint has pages in
    8336                 :          * the beginning with the old timeline ID.  That can cause trouble at
    8337 ECB             :          * recovery: we won't have a history file covering the old timeline if
    8338                 :          * pg_wal directory was not included in the base backup and the WAL
    8339                 :          * archive was cleared too before starting the backup.
    8340 EUB             :          *
    8341                 :          * This also ensures that we have emitted a WAL page header that has
    8342                 :          * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
    8343                 :          * Therefore, if a WAL archiver (such as pglesslog) is trying to
    8344                 :          * compress out removable backup blocks, it won't remove any that
    8345                 :          * occur after this point.
    8346                 :          *
    8347                 :          * During recovery, we skip forcing XLOG file switch, which means that
    8348                 :          * the backup taken during recovery is not available for the special
    8349                 :          * recovery case described above.
    8350                 :          */
    8351 GBC         129 :         if (!backup_started_in_recovery)
    8352 GIC         124 :             RequestXLogSwitch(false);
    8353                 : 
    8354                 :         do
    8355                 :         {
    8356                 :             bool        checkpointfpw;
    8357                 : 
    8358 EUB             :             /*
    8359                 :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
    8360                 :              * page problems, this guarantees that two successive backup runs
    8361                 :              * will have different checkpoint positions and hence different
    8362                 :              * history file names, even if nothing happened in between.
    8363                 :              *
    8364                 :              * During recovery, establish a restartpoint if possible. We use
    8365                 :              * the last restartpoint as the backup starting checkpoint. This
    8366                 :              * means that two successive backup runs can have same checkpoint
    8367                 :              * positions.
    8368                 :              *
    8369                 :              * Since the fact that we are executing do_pg_backup_start()
    8370                 :              * during recovery means that checkpointer is running, we can use
    8371                 :              * RequestCheckpoint() to establish a restartpoint.
    8372                 :              *
    8373                 :              * We use CHECKPOINT_IMMEDIATE only if requested by user (via
    8374                 :              * passing fast = true).  Otherwise this can take awhile.
    8375                 :              */
    8376 GBC         129 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
    8377                 :                               (fast ? CHECKPOINT_IMMEDIATE : 0));
    8378 EUB             : 
    8379                 :             /*
    8380                 :              * Now we need to fetch the checkpoint record location, and also
    8381                 :              * its REDO pointer.  The oldest point in WAL that would be needed
    8382                 :              * to restore starting from the checkpoint is precisely the REDO
    8383                 :              * pointer.
    8384                 :              */
    8385 GBC         129 :             LWLockAcquire(ControlFileLock, LW_SHARED);
    8386 GNC         129 :             state->checkpointloc = ControlFile->checkPoint;
    8387             129 :             state->startpoint = ControlFile->checkPointCopy.redo;
    8388             129 :             state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
    8389 GIC         129 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
    8390 GBC         129 :             LWLockRelease(ControlFileLock);
    8391                 : 
    8392 GIC         129 :             if (backup_started_in_recovery)
    8393                 :             {
    8394 EUB             :                 XLogRecPtr  recptr;
    8395                 : 
    8396                 :                 /*
    8397                 :                  * Check to see if all WAL replayed during online backup
    8398                 :                  * (i.e., since last restartpoint used as backup starting
    8399                 :                  * checkpoint) contain full-page writes.
    8400                 :                  */
    8401 GIC           5 :                 SpinLockAcquire(&XLogCtl->info_lck);
    8402               5 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
    8403               5 :                 SpinLockRelease(&XLogCtl->info_lck);
    8404                 : 
    8405 GNC           5 :                 if (!checkpointfpw || state->startpoint <= recptr)
    8406 UIC           0 :                     ereport(ERROR,
    8407                 :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    8408                 :                              errmsg("WAL generated with full_page_writes=off was replayed "
    8409                 :                                     "since last restartpoint"),
    8410                 :                              errhint("This means that the backup being taken on the standby "
    8411                 :                                      "is corrupt and should not be used. "
    8412                 :                                      "Enable full_page_writes and run CHECKPOINT on the primary, "
    8413                 :                                      "and then try an online backup again.")));
    8414                 : 
    8415                 :                 /*
    8416                 :                  * During recovery, since we don't use the end-of-backup WAL
    8417                 :                  * record and don't write the backup history file, the
    8418                 :                  * starting WAL location doesn't need to be unique. This means
    8419                 :                  * that two base backups started at the same time might use
    8420                 :                  * the same checkpoint as starting locations.
    8421                 :                  */
    8422 GIC           5 :                 gotUniqueStartpoint = true;
    8423                 :             }
    8424                 : 
    8425                 :             /*
    8426                 :              * If two base backups are started at the same time (in WAL sender
    8427                 :              * processes), we need to make sure that they use different
    8428                 :              * checkpoints as starting locations, because we use the starting
    8429                 :              * WAL location as a unique identifier for the base backup in the
    8430 ECB             :              * end-of-backup WAL record and when we write the backup history
    8431                 :              * file. Perhaps it would be better to generate a separate unique ID
    8432                 :              * for each backup instead of forcing another checkpoint, but
    8433                 :              * taking a checkpoint right after another is not that expensive
    8434                 :              * either because only few buffers have been dirtied yet.
    8435                 :              */
    8436 CBC         129 :             WALInsertLockAcquireExclusive();
    8437 GNC         129 :             if (XLogCtl->Insert.lastBackupStart < state->startpoint)
    8438                 :             {
    8439             129 :                 XLogCtl->Insert.lastBackupStart = state->startpoint;
    8440 GIC         129 :                 gotUniqueStartpoint = true;
    8441                 :             }
    8442 CBC         129 :             WALInsertLockRelease();
    8443 GBC         129 :         } while (!gotUniqueStartpoint);
    8444                 : 
    8445 ECB             :         /*
    8446                 :          * Construct tablespace_map file.
    8447                 :          */
    8448 GIC         129 :         datadirpathlen = strlen(DataDir);
    8449                 : 
    8450                 :         /* Collect information about all tablespaces */
    8451 CBC         129 :         tblspcdir = AllocateDir("pg_tblspc");
    8452 GIC         414 :         while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
    8453                 :         {
    8454                 :             char        fullpath[MAXPGPATH + 10];
    8455                 :             char        linkpath[MAXPGPATH];
    8456             285 :             char       *relpath = NULL;
    8457                 :             int         rllen;
    8458                 :             StringInfoData escapedpath;
    8459                 :             char       *s;
    8460                 : 
    8461                 :             /* Skip anything that doesn't look like a tablespace */
    8462             285 :             if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    8463             269 :                 continue;
    8464                 : 
    8465              27 :             snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
    8466                 : 
    8467                 :             /*
    8468                 :              * Skip anything that isn't a symlink/junction.  For testing only,
    8469                 :              * we sometimes use allow_in_place_tablespaces to create
    8470                 :              * directories directly under pg_tblspc, which would fail below.
    8471                 :              */
    8472              27 :             if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK)
    8473              11 :                 continue;
    8474 ECB             : 
    8475 CBC          16 :             rllen = readlink(fullpath, linkpath, sizeof(linkpath));
    8476 GIC          16 :             if (rllen < 0)
    8477                 :             {
    8478 UIC           0 :                 ereport(WARNING,
    8479                 :                         (errmsg("could not read symbolic link \"%s\": %m",
    8480                 :                                 fullpath)));
    8481               0 :                 continue;
    8482 ECB             :             }
    8483 GIC          16 :             else if (rllen >= sizeof(linkpath))
    8484 ECB             :             {
    8485 UIC           0 :                 ereport(WARNING,
    8486                 :                         (errmsg("symbolic link \"%s\" target is too long",
    8487                 :                                 fullpath)));
    8488               0 :                 continue;
    8489                 :             }
    8490 GIC          16 :             linkpath[rllen] = '\0';
    8491                 : 
    8492                 :             /*
    8493                 :              * Build a backslash-escaped version of the link path to include
    8494                 :              * in the tablespace map file.
    8495                 :              */
    8496              16 :             initStringInfo(&escapedpath);
    8497             444 :             for (s = linkpath; *s; s++)
    8498                 :             {
    8499             428 :                 if (*s == '\n' || *s == '\r' || *s == '\\')
    8500 UIC           0 :                     appendStringInfoChar(&escapedpath, '\\');
    8501 GIC         428 :                 appendStringInfoChar(&escapedpath, *s);
    8502                 :             }
    8503                 : 
    8504                 :             /*
    8505                 :              * Relpath holds the relative path of the tablespace directory
    8506                 :              * when it's located within PGDATA, or NULL if it's located
    8507                 :              * elsewhere.
    8508                 :              */
    8509              16 :             if (rllen > datadirpathlen &&
    8510               1 :                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
    8511 LBC           0 :                 IS_DIR_SEP(linkpath[datadirpathlen]))
    8512               0 :                 relpath = linkpath + datadirpathlen + 1;
    8513                 : 
    8514 GIC          16 :             ti = palloc(sizeof(tablespaceinfo));
    8515              16 :             ti->oid = pstrdup(de->d_name);
    8516              16 :             ti->path = pstrdup(linkpath);
    8517              16 :             ti->rpath = relpath ? pstrdup(relpath) : NULL;
    8518              16 :             ti->size = -1;
    8519                 : 
    8520              16 :             if (tablespaces)
    8521              16 :                 *tablespaces = lappend(*tablespaces, ti);
    8522                 : 
    8523              16 :             appendStringInfo(tblspcmapfile, "%s %s\n",
    8524                 :                              ti->oid, escapedpath.data);
    8525                 : 
    8526              16 :             pfree(escapedpath.data);
    8527                 :         }
    8528             129 :         FreeDir(tblspcdir);
    8529                 : 
    8530 GNC         129 :         state->starttime = (pg_time_t) time(NULL);
    8531                 :     }
    8532             129 :     PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
    8533                 : 
    8534             129 :     state->started_in_recovery = backup_started_in_recovery;
    8535 ECB             : 
    8536                 :     /*
    8537                 :      * Mark that the start phase has correctly finished for the backup.
    8538                 :      */
    8539 GBC         129 :     sessionBackupState = SESSION_BACKUP_RUNNING;
    8540 GIC         129 : }
    8541                 : 
    8542                 : /*
    8543                 :  * Utility routine to fetch the session-level status of a running backup.
    8544                 :  */
    8545 ECB             : SessionBackupState
    8546 CBC         149 : get_backup_status(void)
    8547                 : {
    8548             149 :     return sessionBackupState;
    8549 ECB             : }
    8550                 : 
    8551                 : /*
    8552                 :  * do_pg_backup_stop
    8553                 :  *
    8554                 :  * Utility function called at the end of an online backup.  It creates a history
    8555                 :  * file (if required), resets sessionBackupState and so on.  It can optionally
    8556                 :  * wait for WAL segments to be archived.
    8557                 :  *
    8558                 :  * "state" is filled with the information necessary to restore from this
    8559                 :  * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
    8560                 :  *
    8561                 :  * It is the responsibility of the caller of this function to verify the
    8562                 :  * permissions of the calling user!
    8563                 :  */
    8564                 : void
    8565 GNC         122 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
    8566 ECB             : {
    8567 GNC         122 :     bool        backup_stopped_in_recovery = false;
    8568 ECB             :     char        histfilepath[MAXPGPATH];
    8569                 :     char        lastxlogfilename[MAXFNAMELEN];
    8570                 :     char        histfilename[MAXFNAMELEN];
    8571                 :     XLogSegNo   _logSegNo;
    8572                 :     FILE       *fp;
    8573                 :     int         seconds_before_warning;
    8574 CBC         122 :     int         waits = 0;
    8575 GIC         122 :     bool        reported_waiting = false;
    8576                 : 
    8577 GNC         122 :     Assert(state != NULL);
    8578                 : 
    8579             122 :     backup_stopped_in_recovery = RecoveryInProgress();
    8580 EUB             : 
    8581                 :     /*
    8582 ECB             :      * During recovery, we don't need to check WAL level, because if the WAL
    8583                 :      * level is not sufficient, it's impossible to get here during recovery.
    8584 EUB             :      */
    8585 GNC         122 :     if (!backup_stopped_in_recovery && !XLogIsNeeded())
    8586 UIC           0 :         ereport(ERROR,
    8587 EUB             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    8588                 :                  errmsg("WAL level not sufficient for making an online backup"),
    8589 ECB             :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
    8590                 : 
    8591                 :     /*
    8592                 :      * OK to update backup counter and session-level lock.
    8593                 :      *
    8594                 :      * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
    8595                 :      * otherwise they can be updated inconsistently, which might cause
    8596                 :      * do_pg_abort_backup() to fail.
    8597                 :      */
    8598 CBC         122 :     WALInsertLockAcquireExclusive();
    8599 EUB             : 
    8600 ECB             :     /*
    8601                 :      * It is expected that each do_pg_backup_start() call is matched by
    8602                 :      * exactly one do_pg_backup_stop() call.
    8603                 :      */
    8604 GIC         122 :     Assert(XLogCtl->Insert.runningBackups > 0);
    8605             122 :     XLogCtl->Insert.runningBackups--;
    8606                 : 
    8607                 :     /*
    8608 ECB             :      * Clean up session-level lock.
    8609                 :      *
    8610                 :      * You might think that WALInsertLockRelease() can be called before
    8611                 :      * cleaning up session-level lock because session-level lock doesn't need
    8612                 :      * to be protected with WAL insertion lock. But since
    8613                 :      * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
    8614                 :      * cleaned up before it.
    8615                 :      */
    8616 GIC         122 :     sessionBackupState = SESSION_BACKUP_NONE;
    8617 ECB             : 
    8618 GIC         122 :     WALInsertLockRelease();
    8619                 : 
    8620 ECB             :     /*
    8621                 :      * If we are taking an online backup from the standby, we confirm that the
    8622                 :      * standby has not been promoted during the backup.
    8623                 :      */
    8624 GNC         122 :     if (state->started_in_recovery && !backup_stopped_in_recovery)
    8625 UIC           0 :         ereport(ERROR,
    8626                 :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    8627                 :                  errmsg("the standby was promoted during online backup"),
    8628                 :                  errhint("This means that the backup being taken is corrupt "
    8629                 :                          "and should not be used. "
    8630                 :                          "Try taking another online backup.")));
    8631                 : 
    8632                 :     /*
    8633                 :      * During recovery, we don't write an end-of-backup record. We assume that
    8634                 :      * pg_control was backed up last and its minimum recovery point can be
    8635                 :      * available as the backup end location. Since we don't have an
    8636                 :      * end-of-backup record, we use the pg_control value to check whether
    8637                 :      * we've reached the end of backup when starting recovery from this
    8638                 :      * backup. We have no way of checking if pg_control wasn't backed up last
    8639                 :      * however.
    8640 ECB             :      *
    8641                 :      * We don't force a switch to new WAL file but it is still possible to
    8642                 :      * wait for all the required files to be archived if waitforarchive is
    8643                 :      * true. This is okay if we use the backup to start a standby and fetch
    8644                 :      * the missing WAL using streaming replication. But in the case of an
    8645                 :      * archive recovery, a user should set waitforarchive to true and wait for
    8646                 :      * them to be archived to ensure that all the required files are
    8647                 :      * available.
    8648                 :      *
    8649                 :      * We return the current minimum recovery point as the backup end
    8650                 :      * location. Note that it can be greater than the exact backup end
    8651                 :      * location if the minimum recovery point is updated after the backup of
    8652                 :      * pg_control. This is harmless for current uses.
    8653                 :      *
    8654                 :      * XXX currently a backup history file is for informational and debug
    8655                 :      * purposes only. It's not essential for an online backup. Furthermore,
    8656                 :      * even if it's created, it will not be archived during recovery because
    8657                 :      * an archiver is not invoked. So it doesn't seem worthwhile to write a
    8658                 :      * backup history file during recovery.
    8659                 :      */
    8660 GNC         122 :     if (backup_stopped_in_recovery)
    8661 EUB             :     {
    8662                 :         XLogRecPtr  recptr;
    8663                 : 
    8664                 :         /*
    8665                 :          * Check to see if all WAL replayed during online backup contain
    8666                 :          * full-page writes.
    8667                 :          */
    8668 GIC           5 :         SpinLockAcquire(&XLogCtl->info_lck);
    8669               5 :         recptr = XLogCtl->lastFpwDisableRecPtr;
    8670               5 :         SpinLockRelease(&XLogCtl->info_lck);
    8671                 : 
    8672 GNC           5 :         if (state->startpoint <= recptr)
    8673 LBC           0 :             ereport(ERROR,
    8674                 :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    8675                 :                      errmsg("WAL generated with full_page_writes=off was replayed "
    8676                 :                             "during online backup"),
    8677                 :                      errhint("This means that the backup being taken on the standby "
    8678                 :                              "is corrupt and should not be used. "
    8679 ECB             :                              "Enable full_page_writes and run CHECKPOINT on the primary, "
    8680                 :                              "and then try an online backup again.")));
    8681                 : 
    8682                 : 
    8683 GIC           5 :         LWLockAcquire(ControlFileLock, LW_SHARED);
    8684 GNC           5 :         state->stoppoint = ControlFile->minRecoveryPoint;
    8685               5 :         state->stoptli = ControlFile->minRecoveryPointTLI;
    8686 GIC           5 :         LWLockRelease(ControlFileLock);
    8687                 :     }
    8688                 :     else
    8689                 :     {
    8690                 :         char       *history_file;
    8691                 : 
    8692                 :         /*
    8693 ECB             :          * Write the backup-end xlog record
    8694                 :          */
    8695 CBC         117 :         XLogBeginInsert();
    8696 GNC         117 :         XLogRegisterData((char *) (&state->startpoint),
    8697                 :                          sizeof(state->startpoint));
    8698             117 :         state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
    8699                 : 
    8700                 :         /*
    8701                 :          * Given that we're not in recovery, InsertTimeLineID is set and can't
    8702 ECB             :          * change, so we can read it without a lock.
    8703 EUB             :          */
    8704 GNC         117 :         state->stoptli = XLogCtl->InsertTimeLineID;
    8705                 : 
    8706                 :         /*
    8707                 :          * Force a switch to a new xlog segment file, so that the backup is
    8708                 :          * valid as soon as archiver moves out the current segment file.
    8709                 :          */
    8710 GIC         117 :         RequestXLogSwitch(false);
    8711                 : 
    8712 GNC         117 :         state->stoptime = (pg_time_t) time(NULL);
    8713                 : 
    8714                 :         /*
    8715                 :          * Write the backup history file
    8716                 :          */
    8717             117 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    8718             117 :         BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
    8719                 :                               state->startpoint, wal_segment_size);
    8720 GIC         117 :         fp = AllocateFile(histfilepath, "w");
    8721             117 :         if (!fp)
    8722 UIC           0 :             ereport(ERROR,
    8723                 :                     (errcode_for_file_access(),
    8724                 :                      errmsg("could not create file \"%s\": %m",
    8725                 :                             histfilepath)));
    8726                 : 
    8727                 :         /* Build and save the contents of the backup history file */
    8728 GNC         117 :         history_file = build_backup_content(state, true);
    8729             117 :         fprintf(fp, "%s", history_file);
    8730             117 :         pfree(history_file);
    8731                 : 
    8732 GIC         117 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
    8733 LBC           0 :             ereport(ERROR,
    8734 ECB             :                     (errcode_for_file_access(),
    8735                 :                      errmsg("could not write file \"%s\": %m",
    8736                 :                             histfilepath)));
    8737                 : 
    8738 EUB             :         /*
    8739                 :          * Clean out any no-longer-needed history files.  As a side effect,
    8740                 :          * this will post a .ready file for the newly created history file,
    8741                 :          * notifying the archiver that the history file may be archived
    8742                 :          * immediately.
    8743                 :          */
    8744 GIC         117 :         CleanupBackupHistory();
    8745                 :     }
    8746                 : 
    8747                 :     /*
    8748 ECB             :      * If archiving is enabled, wait for all the required WAL files to be
    8749                 :      * archived before returning. If archiving isn't enabled, the required WAL
    8750                 :      * needs to be transported via streaming replication (hopefully with
    8751                 :      * wal_keep_size set high enough), or some more exotic mechanism like
    8752                 :      * polling and copying files from pg_wal with a script. We have no knowledge
    8753                 :      * of those mechanisms, so it's up to the user to ensure that he gets all
    8754                 :      * the required WAL.
    8755                 :      *
    8756                 :      * We wait until both the last WAL file filled during backup and the
    8757                 :      * history file have been archived, and assume that the alphabetic sorting
    8758                 :      * property of the WAL files ensures any earlier WAL files are safely
    8759                 :      * archived as well.
    8760                 :      *
    8761                 :      * We wait forever, since archive_command is supposed to work and we
    8762                 :      * assume the admin wanted his backup to work completely. If you don't
    8763                 :      * wish to wait, then either waitforarchive should be passed in as false,
    8764                 :      * or you can set statement_timeout.  Also, some notices are issued to
    8765                 :      * clue in anyone who might be doing this interactively.
    8766                 :      */
    8767                 : 
    8768 GIC         122 :     if (waitforarchive &&
    8769 GNC           8 :         ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
    8770               1 :          (backup_stopped_in_recovery && XLogArchivingAlways())))
    8771                 :     {
    8772               2 :         XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
    8773               2 :         XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
    8774                 :                      wal_segment_size);
    8775                 : 
    8776               2 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
    8777               2 :         BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
    8778                 :                               state->startpoint, wal_segment_size);
    8779                 : 
    8780 GIC           2 :         seconds_before_warning = 60;
    8781               2 :         waits = 0;
    8782                 : 
    8783 CBC           6 :         while (XLogArchiveIsBusy(lastxlogfilename) ||
    8784               2 :                XLogArchiveIsBusy(histfilename))
    8785                 :         {
    8786               2 :             CHECK_FOR_INTERRUPTS();
    8787 ECB             : 
    8788 GBC           2 :             if (!reported_waiting && waits > 5)
    8789                 :             {
    8790 UIC           0 :                 ereport(NOTICE,
    8791                 :                         (errmsg("base backup done, waiting for required WAL segments to be archived")));
    8792               0 :                 reported_waiting = true;
    8793                 :             }
    8794 ECB             : 
    8795 CBC           2 :             (void) WaitLatch(MyLatch,
    8796 ECB             :                              WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
    8797                 :                              1000L,
    8798                 :                              WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
    8799 GBC           2 :             ResetLatch(MyLatch);
    8800                 : 
    8801 GIC           2 :             if (++waits >= seconds_before_warning)
    8802                 :             {
    8803 UIC           0 :                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
    8804               0 :                 ereport(WARNING,
    8805                 :                         (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
    8806                 :                                 waits),
    8807                 :                          errhint("Check that your archive_command is executing properly.  "
    8808                 :                                  "You can safely cancel this backup, "
    8809                 :                                  "but the database backup will not be usable without all the WAL segments.")));
    8810 ECB             :             }
    8811                 :         }
    8812                 : 
    8813 GIC           2 :         ereport(NOTICE,
    8814                 :                 (errmsg("all required WAL segments have been archived")));
    8815                 :     }
    8816             120 :     else if (waitforarchive)
    8817               6 :         ereport(NOTICE,
    8818                 :                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
    8819             122 : }
    8820                 : 
    8821                 : 
    8822                 : /*
    8823                 :  * do_pg_abort_backup: abort a running backup
    8824                 :  *
    8825                 :  * This does just the most basic steps of do_pg_backup_stop(), by taking the
    8826                 :  * system out of backup mode, thus making it a lot more safe to call from
    8827 ECB             :  * an error handler.
    8828                 :  *
    8829                 :  * 'arg' being true indicates that it's called during backup setup; so
    8830                 :  * sessionBackupState has not been modified yet, but runningBackups has
    8831                 :  * already been incremented.  When it's false, then it's invoked as a
    8832                 :  * before_shmem_exit handler, and therefore we must not change state
    8833                 :  * unless sessionBackupState indicates that a backup is actually running.
    8834                 :  *
    8835                 :  * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
    8836                 :  * before_shmem_exit handler, hence the odd-looking signature.
    8837                 :  */
    8838                 : void
    8839 CBC           9 : do_pg_abort_backup(int code, Datum arg)
    8840                 : {
    8841 GNC           9 :     bool        during_backup_start = DatumGetBool(arg);
    8842 ECB             : 
    8843                 :     /* If called during backup start, there shouldn't be one already running */
    8844 GNC           9 :     Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
    8845 ECB             : 
    8846 GNC           9 :     if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
    8847 EUB             :     {
    8848 GNC           7 :         WALInsertLockAcquireExclusive();
    8849               7 :         Assert(XLogCtl->Insert.runningBackups > 0);
    8850               7 :         XLogCtl->Insert.runningBackups--;
    8851                 : 
    8852               7 :         sessionBackupState = SESSION_BACKUP_NONE;
    8853               7 :         WALInsertLockRelease();
    8854                 : 
    8855               7 :         if (!during_backup_start)
    8856               7 :             ereport(WARNING,
    8857                 :                     errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
    8858                 :     }
    8859 GIC           9 : }
    8860 EUB             : 
    8861                 : /*
    8862                 :  * Register a handler that will warn about unterminated backups at end of
    8863                 :  * session, unless this has already been done.
    8864                 :  */
    8865                 : void
    8866 GIC           4 : register_persistent_abort_backup_handler(void)
    8867                 : {
    8868                 :     static bool already_done = false;
    8869                 : 
    8870 CBC           4 :     if (already_done)
    8871 GIC           1 :         return;
    8872 GNC           3 :     before_shmem_exit(do_pg_abort_backup, DatumGetBool(false));
    8873 CBC           3 :     already_done = true;
    8874 ECB             : }
    8875                 : 
    8876                 : /*
    8877                 :  * Get latest WAL insert pointer
    8878                 :  */
    8879                 : XLogRecPtr
    8880 GIC        4968 : GetXLogInsertRecPtr(void)
    8881                 : {
    8882            4968 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
    8883                 :     uint64      current_bytepos;
    8884                 : 
    8885            4968 :     SpinLockAcquire(&Insert->insertpos_lck);
    8886            4968 :     current_bytepos = Insert->CurrBytePos;
    8887            4968 :     SpinLockRelease(&Insert->insertpos_lck);
    8888                 : 
    8889            4968 :     return XLogBytePosToRecPtr(current_bytepos);
    8890                 : }
    8891                 : 
    8892                 : /*
    8893                 :  * Get latest WAL write pointer
    8894                 :  */
    8895                 : XLogRecPtr
    8896 CBC        1029 : GetXLogWriteRecPtr(void)
    8897                 : {
    8898            1029 :     SpinLockAcquire(&XLogCtl->info_lck);
    8899 GIC        1029 :     LogwrtResult = XLogCtl->LogwrtResult;
    8900            1029 :     SpinLockRelease(&XLogCtl->info_lck);
    8901 ECB             : 
    8902 GIC        1029 :     return LogwrtResult.Write;
    8903 ECB             : }
    8904                 : 
    8905                 : /*
    8906                 :  * Returns the redo pointer of the last checkpoint or restartpoint. This is
    8907                 :  * the oldest point in WAL that we still need, if we have to restart recovery.
    8908                 :  */
    8909                 : void
    8910 CBC          53 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
    8911                 : {
    8912              53 :     LWLockAcquire(ControlFileLock, LW_SHARED);
    8913              53 :     *oldrecptr = ControlFile->checkPointCopy.redo;
    8914 GIC          53 :     *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
    8915              53 :     LWLockRelease(ControlFileLock);
    8916 CBC          53 : }
    8917                 : 
    8918                 : /* Thin wrapper around ShutdownWalRcv(). */
    8919                 : void
    8920 GIC        1283 : XLogShutdownWalRcv(void)
    8921                 : {
    8922            1283 :     ShutdownWalRcv();
    8923 ECB             : 
    8924 GIC        1283 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8925            1283 :     XLogCtl->InstallXLogFileSegmentActive = false;
    8926            1283 :     LWLockRelease(ControlFileLock);
    8927 CBC        1283 : }
    8928 ECB             : 
    8929                 : /* Enable WAL file recycling and preallocation. */
    8930                 : void
    8931 GIC        1572 : SetInstallXLogFileSegmentActive(void)
    8932                 : {
    8933            1572 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    8934            1572 :     XLogCtl->InstallXLogFileSegmentActive = true;
    8935            1572 :     LWLockRelease(ControlFileLock);
    8936            1572 : }
    8937 ECB             : 
    8938                 : bool
    8939 CBC          24 : IsInstallXLogFileSegmentActive(void)
    8940                 : {
    8941                 :     bool        result;
    8942 ECB             : 
    8943 CBC          24 :     LWLockAcquire(ControlFileLock, LW_SHARED);
    8944              24 :     result = XLogCtl->InstallXLogFileSegmentActive;
    8945 GIC          24 :     LWLockRelease(ControlFileLock);
    8946 ECB             : 
    8947 GIC          24 :     return result;
    8948                 : }
    8949                 : 
    8950                 : /*
    8951                 :  * Update the WalWriterSleeping flag.
    8952                 :  */
    8953 ECB             : void
    8954 GIC         366 : SetWalWriterSleeping(bool sleeping)
    8955 ECB             : {
    8956 CBC         366 :     SpinLockAcquire(&XLogCtl->info_lck);
    8957             366 :     XLogCtl->WalWriterSleeping = sleeping;
    8958 GIC         366 :     SpinLockRelease(&XLogCtl->info_lck);
    8959 CBC         366 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a