LCOV - Differential Code Coverage HEAD vs 15 - src/backend/access/transam/xlog.c

LCOV - differential code coverage report

Current view:	top level - src/backend/access/transam - xlog.c (source / functions)		Coverage	Total	Hit	UNC	LBC	UIC	UBC	GBC	GIC	GNC	CBC	EUB	ECB	DUB	DCB
Current:	Differential Code Coverage HEAD vs 15	Lines:	88.4 %	2395	2116	28	63	167	21	51	1392	120	553	187	1428	20	72
Current Date:	2023-04-08 17:13:01	Functions:	99.1 %	116	115			1			106	9			112	1	3
Baseline:	15	Line coverage date bins:
Baseline Date:	2023-04-08 15:09:40	[..60] days:	86.7 %	30	26	4						26
Legend:	Lines: hit not hit	(60,120] days:	50.0 %	4	2	1	1					1	1	1	2
		(120,180] days:	88.9 %	18	16	2						16
		(180,240] days:	80.6 %	98	79	19				1	1	75	2		5
		(240..) days:	88.8 %	2245	1993	2	62	167	21	50	1391	2	550	186	1394
		Function coverage date bins:
		[..60] days:	100.0 %	1	1							1
		(180,240] days:	100.0 %	8	8							8
		(240..) days:	49.5 %	214	106			1			106				107

 Age         Owner                  TLA  Line data    Source code

                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * xlog.c
                                  4                 :  *      PostgreSQL write-ahead log manager
                                  5                 :  *
                                  6                 :  * The Write-Ahead Log (WAL) functionality is split into several source
                                  7                 :  * files, in addition to this one:
                                  8                 :  *
                                  9                 :  * xloginsert.c - Functions for constructing WAL records
                                 10                 :  * xlogrecovery.c - WAL recovery and standby code
                                 11                 :  * xlogreader.c - Facility for reading WAL files and parsing WAL records
                                 12                 :  * xlogutils.c - Helper functions for WAL redo routines
                                 13                 :  *
                                 14                 :  * This file contains functions for coordinating database startup and
                                 15                 :  * checkpointing, and managing the write-ahead log buffers when the
                                 16                 :  * system is running.
                                 17                 :  *
                                 18                 :  * StartupXLOG() is the main entry point of the startup process.  It
                                 19                 :  * coordinates database startup, performing WAL recovery, and the
                                 20                 :  * transition from WAL recovery into normal operations.
                                 21                 :  *
                                 22                 :  * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
                                 23                 :  * callers should not call this directly, but use the functions in
                                 24                 :  * xloginsert.c to construct the WAL record.  XLogFlush() can be used
                                 25                 :  * to force the WAL to disk.
                                 26                 :  *
                                 27                 :  * In addition to those, there are many other functions for interrogating
                                 28                 :  * the current system state, and for starting/stopping backups.
                                 29                 :  *
                                 30                 :  *
                                 31                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                 32                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                 33                 :  *
                                 34                 :  * src/backend/access/transam/xlog.c
                                 35                 :  *
                                 36                 :  *-------------------------------------------------------------------------
                                 37                 :  */
                                 38                 : 
                                 39                 : #include "postgres.h"
                                 40                 : 
                                 41                 : #include <ctype.h>
                                 42                 : #include <math.h>
                                 43                 : #include <time.h>
                                 44                 : #include <fcntl.h>
                                 45                 : #include <sys/stat.h>
                                 46                 : #include <sys/time.h>
                                 47                 : #include <unistd.h>
                                 48                 : 
                                 49                 : #include "access/clog.h"
                                 50                 : #include "access/commit_ts.h"
                                 51                 : #include "access/heaptoast.h"
                                 52                 : #include "access/multixact.h"
                                 53                 : #include "access/rewriteheap.h"
                                 54                 : #include "access/subtrans.h"
                                 55                 : #include "access/timeline.h"
                                 56                 : #include "access/transam.h"
                                 57                 : #include "access/twophase.h"
                                 58                 : #include "access/xact.h"
                                 59                 : #include "access/xlog_internal.h"
                                 60                 : #include "access/xlogarchive.h"
                                 61                 : #include "access/xloginsert.h"
                                 62                 : #include "access/xlogprefetcher.h"
                                 63                 : #include "access/xlogreader.h"
                                 64                 : #include "access/xlogrecovery.h"
                                 65                 : #include "access/xlogutils.h"
                                 66                 : #include "backup/basebackup.h"
                                 67                 : #include "catalog/catversion.h"
                                 68                 : #include "catalog/pg_control.h"
                                 69                 : #include "catalog/pg_database.h"
                                 70                 : #include "common/controldata_utils.h"
                                 71                 : #include "common/file_utils.h"
                                 72                 : #include "executor/instrument.h"
                                 73                 : #include "miscadmin.h"
                                 74                 : #include "pg_trace.h"
                                 75                 : #include "pgstat.h"
                                 76                 : #include "port/atomics.h"
                                 77                 : #include "port/pg_iovec.h"
                                 78                 : #include "postmaster/bgwriter.h"
                                 79                 : #include "postmaster/startup.h"
                                 80                 : #include "postmaster/walwriter.h"
                                 81                 : #include "replication/logical.h"
                                 82                 : #include "replication/origin.h"
                                 83                 : #include "replication/slot.h"
                                 84                 : #include "replication/snapbuild.h"
                                 85                 : #include "replication/walreceiver.h"
                                 86                 : #include "replication/walsender.h"
                                 87                 : #include "storage/bufmgr.h"
                                 88                 : #include "storage/fd.h"
                                 89                 : #include "storage/ipc.h"
                                 90                 : #include "storage/large_object.h"
                                 91                 : #include "storage/latch.h"
                                 92                 : #include "storage/pmsignal.h"
                                 93                 : #include "storage/predicate.h"
                                 94                 : #include "storage/proc.h"
                                 95                 : #include "storage/procarray.h"
                                 96                 : #include "storage/reinit.h"
                                 97                 : #include "storage/smgr.h"
                                 98                 : #include "storage/spin.h"
                                 99                 : #include "storage/sync.h"
                                100                 : #include "utils/guc_hooks.h"
                                101                 : #include "utils/guc_tables.h"
                                102                 : #include "utils/memutils.h"
                                103                 : #include "utils/ps_status.h"
                                104                 : #include "utils/relmapper.h"
                                105                 : #include "utils/pg_rusage.h"
                                106                 : #include "utils/snapmgr.h"
                                107                 : #include "utils/timeout.h"
                                108                 : #include "utils/timestamp.h"
                                109                 : #include "utils/varlena.h"
                                110                 : 
                                111                 : extern uint32 bootstrap_data_checksum_version;
                                112                 : 
                                113                 : /* timeline ID to be used when bootstrapping */
                                114                 : #define BootstrapTimeLineID     1
                                115                 : 
                                116                 : /* User-settable parameters */
                                117                 : int         max_wal_size_mb = 1024; /* 1 GB */
                                118                 : int         min_wal_size_mb = 80;   /* 80 MB */
                                119                 : int         wal_keep_size_mb = 0;
                                120                 : int         XLOGbuffers = -1;
                                121                 : int         XLogArchiveTimeout = 0;
                                122                 : int         XLogArchiveMode = ARCHIVE_MODE_OFF;
                                123                 : char       *XLogArchiveCommand = NULL;
                                124                 : bool        EnableHotStandby = false;
                                125                 : bool        fullPageWrites = true;
                                126                 : bool        wal_log_hints = false;
                                127                 : int         wal_compression = WAL_COMPRESSION_NONE;
                                128                 : char       *wal_consistency_checking_string = NULL;
                                129                 : bool       *wal_consistency_checking = NULL;
                                130                 : bool        wal_init_zero = true;
                                131                 : bool        wal_recycle = true;
                                132                 : bool        log_checkpoints = true;
                                133                 : int         sync_method = DEFAULT_SYNC_METHOD;
                                134                 : int         wal_level = WAL_LEVEL_REPLICA;
                                135                 : int         CommitDelay = 0;    /* precommit delay in microseconds */
                                136                 : int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
                                137                 : int         wal_retrieve_retry_interval = 5000;
                                138                 : int         max_slot_wal_keep_size_mb = -1;
                                139                 : int         wal_decode_buffer_size = 512 * 1024;
                                140                 : bool        track_wal_io_timing = false;
                                141                 : 
                                142                 : #ifdef WAL_DEBUG
                                143                 : bool        XLOG_DEBUG = false;
                                144                 : #endif
                                145                 : 
                                146                 : int         wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
                                147                 : 
                                148                 : /*
                                149                 :  * Number of WAL insertion locks to use. A higher value allows more insertions
                                150                 :  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
                                151                 :  * which needs to iterate all the locks.
                                152                 :  */
                                153                 : #define NUM_XLOGINSERT_LOCKS  8
                                154                 : 
                                155                 : /*
                                156                 :  * Max distance from last checkpoint, before triggering a new xlog-based
                                157                 :  * checkpoint.
                                158                 :  */
                                159                 : int         CheckPointSegments;
                                160                 : 
                                161                 : /* Estimated distance between checkpoints, in bytes */
                                162                 : static double CheckPointDistanceEstimate = 0;
                                163                 : static double PrevCheckPointDistance = 0;
                                164                 : 
                                165                 : /*
                                166                 :  * Track whether there were any deferred checks for custom resource managers
                                167                 :  * specified in wal_consistency_checking.
                                168                 :  */
                                169                 : static bool check_wal_consistency_checking_deferred = false;
                                170                 : 
                                171                 : /*
                                172                 :  * GUC support
                                173                 :  */
                                174                 : const struct config_enum_entry sync_method_options[] = {
                                175                 :     {"fsync", SYNC_METHOD_FSYNC, false},
                                176                 : #ifdef HAVE_FSYNC_WRITETHROUGH
                                177                 :     {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
                                178                 : #endif
                                179                 :     {"fdatasync", SYNC_METHOD_FDATASYNC, false},
                                180                 : #ifdef O_SYNC
                                181                 :     {"open_sync", SYNC_METHOD_OPEN, false},
                                182                 : #endif
                                183                 : #ifdef O_DSYNC
                                184                 :     {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
                                185                 : #endif
                                186                 :     {NULL, 0, false}
                                187                 : };
                                188                 : 
                                189                 : 
                                190                 : /*
                                191                 :  * Although only "on", "off", and "always" are documented,
                                192                 :  * we accept all the likely variants of "on" and "off".
                                193                 :  */
                                194                 : const struct config_enum_entry archive_mode_options[] = {
                                195                 :     {"always", ARCHIVE_MODE_ALWAYS, false},
                                196                 :     {"on", ARCHIVE_MODE_ON, false},
                                197                 :     {"off", ARCHIVE_MODE_OFF, false},
                                198                 :     {"true", ARCHIVE_MODE_ON, true},
                                199                 :     {"false", ARCHIVE_MODE_OFF, true},
                                200                 :     {"yes", ARCHIVE_MODE_ON, true},
                                201                 :     {"no", ARCHIVE_MODE_OFF, true},
                                202                 :     {"1", ARCHIVE_MODE_ON, true},
                                203                 :     {"0", ARCHIVE_MODE_OFF, true},
                                204                 :     {NULL, 0, false}
                                205                 : };
                                206                 : 
                                207                 : /*
                                208                 :  * Statistics for current checkpoint are collected in this global struct.
                                209                 :  * Because only the checkpointer or a stand-alone backend can perform
                                210                 :  * checkpoints, this will be unused in normal backends.
                                211                 :  */
                                212                 : CheckpointStatsData CheckpointStats;
                                213                 : 
                                214                 : /*
                                215                 :  * During recovery, lastFullPageWrites keeps track of full_page_writes that
                                216                 :  * the replayed WAL records indicate. It's initialized with full_page_writes
                                217                 :  * that the recovery starting checkpoint record indicates, and then updated
                                218                 :  * each time XLOG_FPW_CHANGE record is replayed.
                                219                 :  */
                                220                 : static bool lastFullPageWrites;
                                221                 : 
                                222                 : /*
                                223                 :  * Local copy of the state tracked by SharedRecoveryState in shared memory.
                                224                 :  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
                                225                 :  * means "not known, need to check the shared state".
                                226                 :  */
                                227                 : static bool LocalRecoveryInProgress = true;
                                228                 : 
                                229                 : /*
                                230                 :  * Local state for XLogInsertAllowed():
                                231                 :  *      1: unconditionally allowed to insert XLOG
                                232                 :  *      0: unconditionally not allowed to insert XLOG
                                233                 :  *      -1: must check RecoveryInProgress(); disallow until it is false
                                234                 :  * Most processes start with -1 and transition to 1 after seeing that recovery
                                235                 :  * is not in progress.  But we can also force the value for special cases.
                                236                 :  * The coding in XLogInsertAllowed() depends on the first two of these states
                                237                 :  * being numerically the same as bool true and false.
                                238                 :  */
                                239                 : static int  LocalXLogInsertAllowed = -1;
                                240                 : 
                                241                 : /*
                                242                 :  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
                                243                 :  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
                                244                 :  * end+1 of the last record, and is reset when we end a top-level transaction,
                                245                 :  * or start a new one; so it can be used to tell if the current transaction has
                                246                 :  * created any XLOG records.
                                247                 :  *
                                248                 :  * While in parallel mode, this may not be fully up to date.  When committing,
                                249                 :  * a transaction can assume this covers all xlog records written either by the
                                250                 :  * user backend or by any parallel worker which was present at any point during
                                251                 :  * the transaction.  But when aborting, or when still in parallel mode, other
                                252                 :  * parallel backends may have written WAL records at later LSNs than the value
                                253                 :  * stored here.  The parallel leader advances its own copy, when necessary,
                                254                 :  * in WaitForParallelWorkersToFinish.
                                255                 :  */
                                256                 : XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
                                257                 : XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
                                258                 : XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
                                259                 : 
                                260                 : /*
                                261                 :  * RedoRecPtr is this backend's local copy of the REDO record pointer
                                262                 :  * (which is almost but not quite the same as a pointer to the most recent
                                263                 :  * CHECKPOINT record).  We update this from the shared-memory copy,
                                264                 :  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
                                265                 :  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
                                266                 :  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
                                267                 :  * see GetRedoRecPtr.
                                268                 :  *
                                269                 :  * NB: Code that uses this variable must be prepared not only for the
                                270                 :  * possibility that it may be arbitrarily out of date, but also for the
                                271                 :  * possibility that it might be set to InvalidXLogRecPtr. We used to
                                272                 :  * initialize it as a side effect of the first call to RecoveryInProgress(),
                                273                 :  * which meant that most code that might use it could assume that it had a
                                274                 :  * real if perhaps stale value. That's no longer the case.
                                275                 :  */
                                276                 : static XLogRecPtr RedoRecPtr;
                                277                 : 
                                278                 : /*
                                279                 :  * doPageWrites is this backend's local copy of (fullPageWrites ||
                                280                 :  * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
                                281                 :  * a full-page image of a page needs to be taken.
                                282                 :  *
                                283                 :  * NB: Initially this is false, and there's no guarantee that it will be
                                284                 :  * initialized to any other value before it is first used. Any code that
                                285                 :  * makes use of it must recheck the value after obtaining a WALInsertLock,
                                286                 :  * and respond appropriately if it turns out that the previous value wasn't
                                287                 :  * accurate.
                                288                 :  */
                                289                 : static bool doPageWrites;
                                290                 : 
                                291                 : /*----------
                                292                 :  * Shared-memory data structures for XLOG control
                                293                 :  *
                                294                 :  * LogwrtRqst indicates a byte position that we need to write and/or fsync
                                295                 :  * the log up to (all records before that point must be written or fsynced).
                                296                 :  * LogwrtResult indicates the byte positions we have already written/fsynced.
                                297                 :  * These structs are identical but are declared separately to indicate their
                                298                 :  * slightly different functions.
                                299                 :  *
                                300                 :  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
                                301                 :  * WALWriteLock.  To update it, you need to hold both locks.  The point of
                                302                 :  * this arrangement is that the value can be examined by code that already
                                303                 :  * holds WALWriteLock without needing to grab info_lck as well.  In addition
                                304                 :  * to the shared variable, each backend has a private copy of LogwrtResult,
                                305                 :  * which is updated when convenient.
                                306                 :  *
                                307                 :  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
                                308                 :  * (protected by info_lck), but we don't need to cache any copies of it.
                                309                 :  *
                                310                 :  * info_lck is only held long enough to read/update the protected variables,
                                311                 :  * so it's a plain spinlock.  The other locks are held longer (potentially
                                312                 :  * over I/O operations), so we use LWLocks for them.  These locks are:
                                313                 :  *
                                314                 :  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
                                315                 :  * It is only held while initializing and changing the mapping.  If the
                                316                 :  * contents of the buffer being replaced haven't been written yet, the mapping
                                317                 :  * lock is released while the write is done, and reacquired afterwards.
                                318                 :  *
                                319                 :  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
                                320                 :  * XLogFlush).
                                321                 :  *
                                322                 :  * ControlFileLock: must be held to read/update control file or create
                                323                 :  * new log file.
                                324                 :  *
                                325                 :  *----------
                                326                 :  */
                                327                 : 
                                328                 : typedef struct XLogwrtRqst
                                329                 : {
                                330                 :     XLogRecPtr  Write;          /* last byte + 1 to write out */
                                331                 :     XLogRecPtr  Flush;          /* last byte + 1 to flush */
                                332                 : } XLogwrtRqst;
                                333                 : 
                                334                 : typedef struct XLogwrtResult
                                335                 : {
                                336                 :     XLogRecPtr  Write;          /* last byte + 1 written out */
                                337                 :     XLogRecPtr  Flush;          /* last byte + 1 flushed */
                                338                 : } XLogwrtResult;
                                339                 : 
                                340                 : /*
                                341                 :  * Inserting to WAL is protected by a small fixed number of WAL insertion
                                342                 :  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
                                343                 :  * matter which one. To lock out other concurrent insertions, you must hold
                                344                 :  * all of them. Each WAL insertion lock consists of a lightweight lock, plus
                                345                 :  * an indicator of how far the insertion has progressed (insertingAt).
                                346                 :  *
                                347                 :  * The insertingAt values are read when a process wants to flush WAL from
                                348                 :  * the in-memory buffers to disk, to check that all the insertions to the
                                349                 :  * region the process is about to write out have finished. You could simply
                                350                 :  * wait for all currently in-progress insertions to finish, but the
                                351                 :  * insertingAt indicator allows you to ignore insertions to later in the WAL,
                                352                 :  * so that you only wait for the insertions that are modifying the buffers
                                353                 :  * you're about to write out.
                                354                 :  *
                                355                 :  * This isn't just an optimization. If all the WAL buffers are dirty, an
                                356                 :  * inserter that's holding a WAL insert lock might need to evict an old WAL
                                357                 :  * buffer, which requires flushing the WAL. If it's possible for an inserter
                                358                 :  * to block on another inserter unnecessarily, deadlock can arise when two
                                359                 :  * inserters holding a WAL insert lock wait for each other to finish their
                                360                 :  * insertion.
                                361                 :  *
                                362                 :  * Small WAL records that don't cross a page boundary never update the value,
                                363                 :  * the WAL record is just copied to the page and the lock is released. But
                                364                 :  * to avoid the deadlock-scenario explained above, the indicator is always
                                365                 :  * updated before sleeping while holding an insertion lock.
                                366                 :  *
                                367                 :  * lastImportantAt contains the LSN of the last important WAL record inserted
                                368                 :  * using a given lock. This value is used to detect if there has been
                                369                 :  * important WAL activity since the last time some action, like a checkpoint,
                                370                 :  * was performed - allowing the action to be skipped if there has been none.
                                371                 :  * The LSN is updated for all insertions, unless the XLOG_MARK_UNIMPORTANT
                                372                 :  * flag was set. lastImportantAt is never cleared, only overwritten by the LSN
                                373                 :  * of newer records.  Tracking the WAL activity directly in WALInsertLock has
                                374                 :  * the advantage of not needing any additional locks to update the value.
                                375                 :  */
                                376                 : typedef struct
                                377                 : {
                                378                 :     LWLock      lock;           /* the lightweight lock itself */
                                379                 :     XLogRecPtr  insertingAt;    /* how far the insertion has progressed */
                                380                 :     XLogRecPtr  lastImportantAt;    /* LSN of last important WAL record */
                                381                 : } WALInsertLock;
                                382                 : 
                                383                 : /*
                                384                 :  * All the WAL insertion locks are allocated as an array in shared memory. We
                                385                 :  * force the array stride to be a power of 2, which saves a few cycles in
                                386                 :  * indexing, but more importantly also ensures that individual slots don't
                                387                 :  * cross cache line boundaries. (Of course, we have to also ensure that the
                                388                 :  * array start address is suitably aligned.)
                                389                 :  */
                                390                 : typedef union WALInsertLockPadded
                                391                 : {
                                392                 :     WALInsertLock l;            /* the actual insertion lock */
                                393                 :     char        pad[PG_CACHE_LINE_SIZE];    /* makes the union at least PG_CACHE_LINE_SIZE bytes */
                                394                 : } WALInsertLockPadded;
                                395                 : 
                                396                 : /*
                                397                 :  * Session status of running backup, used for sanity checks in SQL-callable
                                398                 :  * functions to start and stop backups.
                                399                 :  */
                                400                 : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
                                401                 : 
                                402                 : /*
                                403                 :  * Shared state data for WAL insertion.
                                404                 :  */
                                405                 : typedef struct XLogCtlInsert
                                406                 : {
                                407                 :     slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
                                408                 : 
                                409                 :     /*
                                410                 :      * CurrBytePos is the end of reserved WAL. The next record will be
                                411                 :      * inserted at that position. PrevBytePos is the start position of the
                                412                 :      * previously inserted (or rather, reserved) record - it is copied to the
                                413                 :      * prev-link of the next record. These are stored as "usable byte
                                414                 :      * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
                                415                 :      */
                                416                 :     uint64      CurrBytePos;
                                417                 :     uint64      PrevBytePos;
                                418                 : 
                                419                 :     /*
                                420                 :      * Make sure the above heavily-contended spinlock and byte positions are
                                421                 :      * on their own cache line. In particular, the RedoRecPtr and full page
                                422                 :      * write variables below should be on a different cache line. They are
                                423                 :      * read on every WAL insertion, but updated rarely, and we don't want
                                424                 :      * those reads to steal the cache line containing Curr/PrevBytePos.
                                425                 :      */
                                426                 :     char        pad[PG_CACHE_LINE_SIZE];
                                427                 : 
                                428                 :     /*
                                429                 :      * fullPageWrites is the authoritative value used by all backends to
                                430                 :      * determine whether to write full-page images to WAL. This shared value,
                                431                 :      * instead of the process-local fullPageWrites, is required because, when
                                432                 :      * full_page_writes is changed by SIGHUP, we must WAL-log it before it
                                433                 :      * actually affects WAL-logging by backends.  The checkpointer sets it at
                                434                 :      * startup or after SIGHUP.
                                435                 :      *
                                436                 :      * To read these fields, you must hold an insertion lock. To modify them,
                                437                 :      * you must hold ALL the locks.
                                438                 :      */
                                439                 :     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
                                440                 :     bool        fullPageWrites;
                                441                 : 
                                442                 :     /*
                                443                 :      * runningBackups is a counter indicating the number of backups currently
                                444                 :      * in progress. lastBackupStart is the latest checkpoint redo location
                                445                 :      * used as a starting point for an online backup.
                                446                 :      */
                                447                 :     int         runningBackups;
                                448                 :     XLogRecPtr  lastBackupStart;
                                449                 : 
                                450                 :     /*
                                451                 :      * WAL insertion locks.
                                452                 :      */
                                453                 :     WALInsertLockPadded *WALInsertLocks;
                                454                 : } XLogCtlInsert;
                                455                 : 
                                456                 : /*
                                457                 :  * Total shared-memory state for XLOG.
                                458                 :  */
                                459                 : typedef struct XLogCtlData
                                460                 : {
                                461                 :     XLogCtlInsert Insert;       /* shared state for WAL insertion */
                                462                 : 
                                463                 :     /* Protected by info_lck: */
                                464                 :     XLogwrtRqst LogwrtRqst;
                                465                 :     XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
                                466                 :     FullTransactionId ckptFullXid;  /* nextXid of latest checkpoint */
                                467                 :     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
                                468                 :     XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
                                469                 : 
                                470                 :     XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */
                                471                 : 
                                472                 :     /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
                                473                 :     XLogRecPtr  unloggedLSN;
                                474                 :     slock_t     ulsn_lck;
                                475                 : 
                                476                 :     /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
                                477                 :     pg_time_t   lastSegSwitchTime;
                                478                 :     XLogRecPtr  lastSegSwitchLSN;
                                479                 : 
                                480                 :     /*
                                481                 :      * Protected by info_lck and WALWriteLock (you must hold either lock to
                                482                 :      * read it, but both to update)
                                483                 :      */
                                484                 :     XLogwrtResult LogwrtResult;
                                485                 : 
                                486                 :     /*
                                487                 :      * Latest initialized page in the cache (last byte position + 1).
                                488                 :      *
                                489                 :      * To change the identity of a buffer (and InitializedUpTo), you need to
                                490                 :      * hold WALBufMappingLock.  To change the identity of a buffer that's
                                491                 :      * still dirty, the old page needs to be written out first, and for that
                                492                 :      * you need WALWriteLock, and you need to ensure that there are no
                                493                 :      * in-progress insertions to the page by calling
                                494                 :      * WaitXLogInsertionsToFinish().
                                495                 :      */
                                496                 :     XLogRecPtr  InitializedUpTo;
                                497                 : 
                                498                 :     /*
                                499                 :      * These values do not change after startup, although the pointed-to pages
                                500                 :      * and xlblocks values certainly do.  xlblocks values are protected by
                                501                 :      * WALBufMappingLock.
                                502                 :      */
                                503                 :     char       *pages;          /* buffers for unwritten XLOG pages */
                                504                 :     XLogRecPtr *xlblocks;       /* 1st byte ptr-s + XLOG_BLCKSZ */
                                505                 :     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
                                506                 : 
                                507                 :     /*
                                508                 :      * InsertTimeLineID is the timeline into which new WAL is being inserted
                                509                 :      * and flushed. It is zero during recovery, and does not change once set.
                                510                 :      *
                                511                 :      * If we created a new timeline when the system was started up,
                                512                 :      * PrevTimeLineID is the old timeline's ID that we forked off from.
                                513                 :      * Otherwise it's equal to InsertTimeLineID.
                                514                 :      */
                                515                 :     TimeLineID  InsertTimeLineID;
                                516                 :     TimeLineID  PrevTimeLineID;
                                517                 : 
                                518                 :     /*
                                519                 :      * SharedRecoveryState indicates if we're still in crash or archive
                                520                 :      * recovery.  Protected by info_lck.
                                521                 :      */
                                522                 :     RecoveryState SharedRecoveryState;
                                523                 : 
                                524                 :     /*
                                525                 :      * InstallXLogFileSegmentActive indicates whether the checkpointer should
                                526                 :      * arrange for future segments by recycling and/or PreallocXlogFiles().
                                527                 :      * Protected by ControlFileLock.  Only the startup process changes it.  If
                                528                 :      * true, anyone can use InstallXLogFileSegment().  If false, the startup
                                529                 :      * process owns the exclusive right to install segments, by reading from
                                530                 :      * the archive and possibly replacing existing files.
                                531                 :      */
                                532                 :     bool        InstallXLogFileSegmentActive;
                                533                 : 
                                534                 :     /*
                                535                 :      * WalWriterSleeping indicates whether the WAL writer is currently in
                                536                 :      * low-power mode (and hence should be nudged if an async commit occurs).
                                537                 :      * Protected by info_lck.
                                538                 :      */
                                539                 :     bool        WalWriterSleeping;
                                540                 : 
                                541                 :     /*
                                542                 :      * During recovery, we keep a copy of the latest checkpoint record here.
                                543                 :      * lastCheckPointRecPtr points to start of checkpoint record and
                                544                 :      * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
                                545                 :      * checkpointer when it wants to create a restartpoint.
                                546                 :      *
                                547                 :      * Protected by info_lck.
                                548                 :      */
                                549                 :     XLogRecPtr  lastCheckPointRecPtr;
                                550                 :     XLogRecPtr  lastCheckPointEndPtr;
                                551                 :     CheckPoint  lastCheckPoint;
                                552                 : 
                                553                 :     /*
                                554                 :      * lastFpwDisableRecPtr points to the start of the last replayed
                                555                 :      * XLOG_FPW_CHANGE record indicating that full_page_writes is disabled.
                                556                 :      */
                                557                 :     XLogRecPtr  lastFpwDisableRecPtr;
                                558                 : 
                                559                 :     slock_t     info_lck;       /* locks shared variables shown above */
                                560                 : } XLogCtlData;
                                561                 : 
                                562                 : static XLogCtlData *XLogCtl = NULL;
                                563                 : 
                                564                 : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
                                565                 : static WALInsertLockPadded *WALInsertLocks = NULL;
                                566                 : 
                                567                 : /*
                                568                 :  * We maintain an image of pg_control in shared memory.
                                569                 :  */
                                570                 : static ControlFileData *ControlFile = NULL;
                                571                 : 
                                572                 : /*
                                573                 :  * Calculate the amount of space left on the page after 'endptr'. Beware
                                574                 :  * multiple evaluation!
                                575                 :  */
                                576                 : #define INSERT_FREESPACE(endptr)    \
                                577                 :     (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
                                578                 : 
                                579                 : /* Macro to advance to next buffer index. */
                                580                 : #define NextBufIdx(idx)     \
                                581                 :         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
                                582                 : 
                                583                 : /*
                                584                 :  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
                                585                 :  * would hold if it was in cache, the page containing 'recptr'.
                                586                 :  */
                                587                 : #define XLogRecPtrToBufIdx(recptr)  \
                                588                 :     (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
                                589                 : 
                                590                 : /*
                                591                 :  * This is the number of bytes in a WAL page usable for WAL data.
                                592                 :  */
                                593                 : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
                                594                 : 
                                595                 : /*
                                596                 :  * Convert values of GUCs measured in megabytes to equiv. segment count.
                                597                 :  * Rounds down.
                                598                 :  */
                                599                 : #define ConvertToXSegs(x, segsize)  XLogMBVarToSegs((x), (segsize))
                                600                 : 
                                601                 : /* The number of bytes in a WAL segment usable for WAL data. */
                                602                 : static int  UsableBytesInSegment;
                                603                 : 
                                604                 : /*
                                605                 :  * Private, possibly out-of-date copy of shared LogwrtResult.
                                606                 :  * See discussion above.
                                607                 :  */
                                608                 : static XLogwrtResult LogwrtResult = {0, 0};
                                609                 : 
                                610                 : /*
                                611                 :  * openLogFile is -1 or a kernel FD for an open log file segment.
                                612                 :  * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
                                613                 :  * These variables are only used to write the XLOG, and so will normally refer
                                614                 :  * to the active segment.
                                615                 :  *
                                616                 :  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
                                617                 :  */
                                618                 : static int  openLogFile = -1;
                                619                 : static XLogSegNo openLogSegNo = 0;
                                620                 : static TimeLineID openLogTLI = 0;
                                621                 : 
                                622                 : /*
                                623                 :  * Local copies of equivalent fields in the control file.  When running
                                624                 :  * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
                                625                 :  * expect to replay all the WAL available, and updateMinRecoveryPoint is
                                626                 :  * switched to false to prevent any updates while replaying records.
                                627                 :  * Those values are kept consistent as long as crash recovery runs.
                                628                 :  */
                                629                 : static XLogRecPtr LocalMinRecoveryPoint;
                                630                 : static TimeLineID LocalMinRecoveryPointTLI;
                                631                 : static bool updateMinRecoveryPoint = true;
                                632                 : 
                                633                 : /* For WALInsertLockAcquire/Release functions */
                                634                 : static int  MyLockNo = 0;
                                635                 : static bool holdingAllLocks = false;
                                636                 : 
                                637                 : #ifdef WAL_DEBUG
                                638                 : static MemoryContext walDebugCxt = NULL;
                                639                 : #endif
                                640                 : 
                                641                 : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
                                642                 :                                         XLogRecPtr EndOfLog,
                                643                 :                                         TimeLineID newTLI);
                                644                 : static void CheckRequiredParameterValues(void);
                                645                 : static void XLogReportParameters(void);
                                646                 : static int  LocalSetXLogInsertAllowed(void);
                                647                 : static void CreateEndOfRecoveryRecord(void);
                                648                 : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
                                649                 :                                                   XLogRecPtr pagePtr,
                                650                 :                                                   TimeLineID newTLI);
                                651                 : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
                                652                 : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
                                653                 : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
                                654                 : 
                                655                 : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
                                656                 :                                   bool opportunistic);
                                657                 : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
                                658                 : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                                659                 :                                    bool find_free, XLogSegNo max_segno,
                                660                 :                                    TimeLineID tli);
                                661                 : static void XLogFileClose(void);
                                662                 : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
                                663                 : static void RemoveTempXlogFiles(void);
                                664                 : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
                                665                 :                                XLogRecPtr endptr, TimeLineID insertTLI);
                                666                 : static void RemoveXlogFile(const struct dirent *segment_de,
                                667                 :                            XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
                                668                 :                            TimeLineID insertTLI);
                                669                 : static void UpdateLastRemovedPtr(char *filename);
                                670                 : static void ValidateXLOGDirectoryStructure(void);
                                671                 : static void CleanupBackupHistory(void);
                                672                 : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
                                673                 : static bool PerformRecoveryXLogAction(void);
                                674                 : static void InitControlFile(uint64 sysidentifier);
                                675                 : static void WriteControlFile(void);
                                676                 : static void ReadControlFile(void);
                                677                 : static void UpdateControlFile(void);
                                678                 : static char *str_time(pg_time_t tnow);
                                679                 : 
                                680                 : static int  get_sync_bit(int method);
                                681                 : 
                                682                 : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
                                683                 :                                 XLogRecData *rdata,
                                684                 :                                 XLogRecPtr StartPos, XLogRecPtr EndPos,
                                685                 :                                 TimeLineID tli);
                                686                 : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
                                687                 :                                       XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
                                688                 : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                                689                 :                               XLogRecPtr *PrevPtr);
                                690                 : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
                                691                 : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
                                692                 : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
                                693                 : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
                                694                 : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
                                695                 : 
                                696                 : static void WALInsertLockAcquire(void);
                                697                 : static void WALInsertLockAcquireExclusive(void);
                                698                 : static void WALInsertLockRelease(void);
                                699                 : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
                                700                 : 
                                701                 : /*
                                702                 :  * Insert an XLOG record represented by an already-constructed chain of data
                                703                 :  * chunks.  This is a low-level routine; to construct the WAL record header
                                704                 :  * and data, use the higher-level routines in xloginsert.c.
                                705                 :  *
                                706                 :  * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
                                707                 :  * WAL record applies to, that were not included in the record as full page
                                708                 :  * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
                                709                 :  * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
                                710                 :  * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
                                711                 :  * record is always inserted.
                                712                 :  *
                                713                 :  * 'flags' gives more in-depth control on the record being inserted. See
                                714                 :  * XLogSetRecordFlags() for details.
                                715                 :  *
                                716                 :  * 'topxid_included' tells whether the top-transaction id is logged along with
                                717                 :  * the current subtransaction. See XLogRecordAssemble().
                                718                 :  *
                                719                 :  * The first XLogRecData in the chain must be for the record header, and its
                                720                 :  * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
                                721                 :  * xl_crc fields in the header, the rest of the header must already be filled
                                722                 :  * by the caller.
                                723                 :  *
                                724                 :  * Returns XLOG pointer to end of record (beginning of next record).
                                725                 :  * This can be used as LSN for data pages affected by the logged action.
                                726                 :  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
                                727                 :  * before the data page can be written out.  This implements the basic
                                728                 :  * WAL rule "write the log before the data".)
                                729                 :  */
                                730                 : XLogRecPtr
 2299 andres                    731 GIC    19404394 : XLogInsertRecord(XLogRecData *rdata,
                                732                 :                  XLogRecPtr fpw_lsn,
                                733                 :                  uint8 flags,
  523 akapila                   734 ECB             :                  int num_fpi,
                                735                 :                  bool topxid_included)
                                736                 : {
 8053 bruce                     737 GIC    19404394 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
                                738                 :     pg_crc32c   rdata_crc;
                                739                 :     bool        inserted;
 3076 heikki.linnakangas        740 CBC    19404394 :     XLogRecord *rechdr = (XLogRecord *) rdata->data;
 2347 tgl                       741 GIC    19404394 :     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
 3076 heikki.linnakangas        742        19404394 :     bool        isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
 2347 tgl                       743 ECB             :                                info == XLOG_SWITCH);
 3562 heikki.linnakangas        744                 :     XLogRecPtr  StartPos;
                                745                 :     XLogRecPtr  EndPos;
 1669 akapila                   746 GIC    19404394 :     bool        prevDoPageWrites = doPageWrites;
                                747                 :     TimeLineID  insertTLI;
                                748                 : 
 3062 heikki.linnakangas        749 ECB             :     /* we assume that all of the record header is in the first chunk */
 3062 heikki.linnakangas        750 GIC    19404394 :     Assert(rdata->len >= SizeOfXLogRecord);
                                751                 : 
                                752                 :     /* cross-check on whether we should be here or not */
 5035 tgl                       753 CBC    19404394 :     if (!XLogInsertAllowed())
 5035 tgl                       754 UIC           0 :         elog(ERROR, "cannot make new WAL entries during recovery");
                                755                 : 
  520 rhaas                     756 ECB             :     /*
  515 rhaas                     757 EUB             :      * Given that we're not in recovery, InsertTimeLineID is set and can't
                                758                 :      * change, so we can read it without a lock.
                                759                 :      */
  515 rhaas                     760 GIC    19404394 :     insertTLI = XLogCtl->InsertTimeLineID;
                                761                 : 
                                762                 :     /*----------
 3562 heikki.linnakangas        763 ECB             :      *
                                764                 :      * We have now done all the preparatory work we can without holding a
                                765                 :      * lock or modifying shared state. From here on, inserting the new WAL
                                766                 :      * record to the shared WAL buffer cache is a two-step process:
                                767                 :      *
                                768                 :      * 1. Reserve the right amount of space from the WAL. The current head of
                                769                 :      *    reserved space is kept in Insert->CurrBytePos, and is protected by
                                770                 :      *    insertpos_lck.
                                771                 :      *
                                772                 :      * 2. Copy the record to the reserved WAL space. This involves finding the
                                773                 :      *    correct WAL buffer containing the reserved space, and copying the
                                774                 :      *    record in place. This can be done concurrently in multiple processes.
                                775                 :      *
                                776                 :      * To keep track of which insertions are still in-progress, each concurrent
                                777                 :      * inserter acquires an insertion lock. In addition to just indicating that
                                778                 :      * an insertion is in progress, the lock tells others how far the inserter
                                779                 :      * has progressed. There is a small fixed number of insertion locks,
                                780                 :      * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
                                781                 :      * boundary, it updates the value stored in the lock to how far it has
                                782                 :      * inserted, to allow the previous buffer to be flushed.
                                783                 :      *
                                784                 :      * Holding onto an insertion lock also protects RedoRecPtr and
                                785                 :      * fullPageWrites from changing until the insertion is finished.
                                786                 :      *
                                787                 :      * Step 2 can usually be done completely in parallel. If the required WAL
                                788                 :      * page is not initialized yet, you have to grab WALBufMappingLock to
                                789                 :      * initialize it, but the WAL writer tries to do that ahead of insertions
                                790                 :      * to prevent that from happening in the critical path.
                                791                 :      *
                                792                 :      *----------
                                793                 :      */
 4106 heikki.linnakangas        794 GIC    19404394 :     START_CRIT_SECTION();
 3306                           795        19404394 :     if (isLogSwitch)
                                796             442 :         WALInsertLockAcquireExclusive();
 3306 heikki.linnakangas        797 ECB             :     else
 3306 heikki.linnakangas        798 CBC    19403952 :         WALInsertLockAcquire();
 4106 heikki.linnakangas        799 ECB             : 
                                800                 :     /*
 1669 akapila                   801                 :      * Check to see if my copy of RedoRecPtr is out of date. If so, may have
                                802                 :      * to go back and have the caller recompute everything. This can only
                                803                 :      * happen just after a checkpoint, so it's better to be slow in this case
                                804                 :      * and fast otherwise.
                                805                 :      *
                                806                 :      * Also check to see if fullPageWrites was just turned on or there's a
                                807                 :      * running backup (which forces full-page writes); if we weren't already
                                808                 :      * doing full-page writes then go back and recompute.
                                809                 :      *
                                810                 :      * If we aren't doing full-page writes then RedoRecPtr doesn't actually
                                811                 :      * affect the contents of the XLOG record, so we'll update our local copy
                                812                 :      * but not force a recomputation.  (If doPageWrites was just turned off,
                                813                 :      * we could recompute the record without full pages, but we choose not to
                                814                 :      * bother.)
                                815                 :      */
 3754 alvherre                  816 GIC    19404394 :     if (RedoRecPtr != Insert->RedoRecPtr)
                                817                 :     {
                                818            4522 :         Assert(RedoRecPtr < Insert->RedoRecPtr);
 4106 heikki.linnakangas        819 CBC        4522 :         RedoRecPtr = Insert->RedoRecPtr;
                                820                 :     }
  172 alvherre                  821 GNC    19404394 :     doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
 4106 heikki.linnakangas        822 ECB             : 
 1669 akapila                   823 GIC    19404394 :     if (doPageWrites &&
 1669 akapila                   824 CBC    19214991 :         (!prevDoPageWrites ||
 1669 akapila                   825 GIC    18134561 :          (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
 4106 heikki.linnakangas        826 ECB             :     {
 3076                           827                 :         /*
 2878 bruce                     828                 :          * Oops, some buffer now needs to be backed up that the caller didn't
                                829                 :          * back up.  Start over.
                                830                 :          */
 3306 heikki.linnakangas        831 GIC        4992 :         WALInsertLockRelease();
 4106                           832            4992 :         END_CRIT_SECTION();
 3076                           833            4992 :         return InvalidXLogRecPtr;
 4106 heikki.linnakangas        834 ECB             :     }
                                835                 : 
 6997 tgl                       836                 :     /*
                                837                 :      * Reserve space for the record in the WAL. This also sets the xl_prev
                                838                 :      * pointer.
                                839                 :      */
 3562 heikki.linnakangas        840 GIC    19399402 :     if (isLogSwitch)
                                841             300 :         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
                                842                 :     else
 3562 heikki.linnakangas        843 ECB             :     {
 3076 heikki.linnakangas        844 CBC    19399102 :         ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                                845                 :                                   &rechdr->xl_prev);
 3562 heikki.linnakangas        846 GIC    19399102 :         inserted = true;
 3562 heikki.linnakangas        847 ECB             :     }
                                848                 : 
 3562 heikki.linnakangas        849 CBC    19399402 :     if (inserted)
                                850                 :     {
                                851                 :         /*
 3062 heikki.linnakangas        852 ECB             :          * Now that xl_prev has been filled in, calculate CRC of the record
                                853                 :          * header.
                                854                 :          */
 3062 heikki.linnakangas        855 GIC    19399350 :         rdata_crc = rechdr->xl_crc;
                                856        19399350 :         COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
 3078                           857        19399350 :         FIN_CRC32C(rdata_crc);
 3562 heikki.linnakangas        858 CBC    19399350 :         rechdr->xl_crc = rdata_crc;
 3562 heikki.linnakangas        859 ECB             : 
                                860                 :         /*
                                861                 :          * All the record data, including the header, is now ready to be
                                862                 :          * inserted. Copy the record in the space reserved.
                                863                 :          */
 3076 heikki.linnakangas        864 GIC    19399350 :         CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
                                865                 :                             StartPos, EndPos, insertTLI);
                                866                 : 
 2299 andres                    867 ECB             :         /*
                                868                 :          * Unless record is flagged as not important, update LSN of last
                                869                 :          * important record in the current slot. When holding all locks, just
                                870                 :          * update the first one.
                                871                 :          */
 2299 andres                    872 GIC    19399350 :         if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
                                873                 :         {
 2153 bruce                     874        19259839 :             int         lockno = holdingAllLocks ? 0 : MyLockNo;
 2299 andres                    875 ECB             : 
 2299 andres                    876 GIC    19259839 :             WALInsertLocks[lockno].l.lastImportantAt = StartPos;
 2299 andres                    877 ECB             :         }
                                878                 :     }
 3562 heikki.linnakangas        879                 :     else
                                880                 :     {
                                881                 :         /*
                                882                 :          * This was an xlog-switch record, but the current insert location was
                                883                 :          * already exactly at the beginning of a segment, so there was no need
                                884                 :          * to do anything.
                                885                 :          */
                                886                 :     }
                                887                 : 
                                888                 :     /*
                                889                 :      * Done! Let others know that we're finished.
                                890                 :      */
 3306 heikki.linnakangas        891 GIC    19399402 :     WALInsertLockRelease();
                                892                 : 
 3562                           893        19399402 :     END_CRIT_SECTION();
 3562 heikki.linnakangas        894 ECB             : 
  523 akapila                   895 GIC    19399402 :     MarkCurrentTransactionIdLoggedIfAny();
  523 akapila                   896 ECB             : 
                                897                 :     /*
                                898                 :      * Mark the top transaction id as logged (if needed) so that we do not try
                                899                 :      * to log it again with the next WAL record in the current subtransaction.
                                900                 :      */
  523 akapila                   901 GIC    19399402 :     if (topxid_included)
                                902             234 :         MarkSubxactTopXidLogged();
                                903                 : 
 3562 heikki.linnakangas        904 ECB             :     /*
   582 alvherre                  905                 :      * Update shared LogwrtRqst.Write, if we crossed a page boundary.
                                906                 :      */
 3562 heikki.linnakangas        907 GIC    19399402 :     if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
                                908                 :     {
 3121 andres                    909          444287 :         SpinLockAcquire(&XLogCtl->info_lck);
  582 alvherre                  910 ECB             :         /* advance global request to include new block(s) */
 3121 andres                    911 GIC      444287 :         if (XLogCtl->LogwrtRqst.Write < EndPos)
 3121 andres                    912 CBC      444138 :             XLogCtl->LogwrtRqst.Write = EndPos;
                                913                 :         /* update local result copy while I have the chance */
                                914          444287 :         LogwrtResult = XLogCtl->LogwrtResult;
                                915          444287 :         SpinLockRelease(&XLogCtl->info_lck);
                                916                 :     }
 3562 heikki.linnakangas        917 ECB             : 
                                918                 :     /*
                                919                 :      * If this was an XLOG_SWITCH record, flush the record and the empty
                                920                 :      * padding space that fills the rest of the segment, and perform
                                921                 :      * end-of-segment actions (eg, notifying archiver).
                                922                 :      */
 3562 heikki.linnakangas        923 GIC    19399402 :     if (isLogSwitch)
                                924                 :     {
                                925                 :         TRACE_POSTGRESQL_WAL_SWITCH();
 3562 heikki.linnakangas        926 CBC         300 :         XLogFlush(EndPos);
                                927                 : 
                                928                 :         /*
 3562 heikki.linnakangas        929 ECB             :          * Even though we reserved the rest of the segment for us, which is
                                930                 :          * reflected in EndPos, we return a pointer to just the end of the
                                931                 :          * xlog-switch record.
                                932                 :          */
 3562 heikki.linnakangas        933 GIC         300 :         if (inserted)
                                934                 :         {
                                935             248 :             EndPos = StartPos + SizeOfXLogRecord;
 3562 heikki.linnakangas        936 CBC         248 :             if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
                                937                 :             {
 2028 andres                    938 LBC           0 :                 uint64      offset = XLogSegmentOffset(EndPos, wal_segment_size);
 2028 andres                    939 ECB             : 
 2028 andres                    940 UIC           0 :                 if (offset == EndPos % XLOG_BLCKSZ)
 3562 heikki.linnakangas        941 UBC           0 :                     EndPos += SizeOfXLogLongPHD;
                                942                 :                 else
                                943               0 :                     EndPos += SizeOfXLogShortPHD;
 3562 heikki.linnakangas        944 EUB             :             }
                                945                 :         }
                                946                 :     }
                                947                 : 
                                948                 : #ifdef WAL_DEBUG
                                949                 :     if (XLOG_DEBUG)
                                950                 :     {
                                951                 :         static XLogReaderState *debug_reader = NULL;
                                952                 :         XLogRecord *record;
                                953                 :         DecodedXLogRecord *decoded;
                                954                 :         StringInfoData buf;
                                955                 :         StringInfoData recordBuf;
                                956                 :         char       *errormsg = NULL;
                                957                 :         MemoryContext oldCxt;
                                958                 : 
                                959                 :         oldCxt = MemoryContextSwitchTo(walDebugCxt);
                                960                 : 
                                961                 :         initStringInfo(&buf);
                                962                 :         appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
                                963                 : 
                                964                 :         /*
                                965                 :          * We have to piece together the WAL record data from the XLogRecData
                                966                 :          * entries, so that we can pass it to the rm_desc function as one
                                967                 :          * contiguous chunk.
                                968                 :          */
                                969                 :         initStringInfo(&recordBuf);
                                970                 :         for (; rdata != NULL; rdata = rdata->next)
                                971                 :             appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
                                972                 : 
                                973                 :         /* We also need temporary space to decode the record. */
                                974                 :         record = (XLogRecord *) recordBuf.data;
                                975                 :         decoded = (DecodedXLogRecord *)
                                976                 :             palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
                                977                 : 
                                978                 :         if (!debug_reader)
                                979                 :             debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
                                980                 :                                               XL_ROUTINE(), NULL);
                                981                 : 
                                982                 :         if (!debug_reader)
                                983                 :         {
                                984                 :             appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
                                985                 :         }
                                986                 :         else if (!DecodeXLogRecord(debug_reader,
                                987                 :                                    decoded,
                                988                 :                                    record,
                                989                 :                                    EndPos,
                                990                 :                                    &errormsg))
                                991                 :         {
                                992                 :             appendStringInfo(&buf, "error decoding record: %s",
                                993                 :                              errormsg ? errormsg : "no error message");
                                994                 :         }
                                995                 :         else
                                996                 :         {
                                997                 :             appendStringInfoString(&buf, " - ");
                                998                 : 
                                999                 :             debug_reader->record = decoded;
                               1000                 :             xlog_outdesc(&buf, debug_reader);
                               1001                 :             debug_reader->record = NULL;
                               1002                 :         }
                               1003                 :         elog(LOG, "%s", buf.data);
                               1004                 : 
                               1005                 :         pfree(decoded);
                               1006                 :         pfree(buf.data);
                               1007                 :         pfree(recordBuf.data);
                               1008                 :         MemoryContextSwitchTo(oldCxt);
                               1009                 :     }
                               1010                 : #endif
                               1011                 : 
                               1012                 :     /*
                               1013                 :      * Update our global variables
                               1014                 :      */
 3562 heikki.linnakangas       1015 GIC    19399402 :     ProcLastRecPtr = StartPos;
                               1016        19399402 :     XactLastRecEnd = EndPos;
                               1017                 : 
 1100 akapila                  1018 ECB             :     /* Report WAL traffic to the instrumentation. */
 1100 akapila                  1019 CBC    19399402 :     if (inserted)
                               1020                 :     {
 1100 akapila                  1021 GIC    19399350 :         pgWalUsage.wal_bytes += rechdr->xl_tot_len;
 1100 akapila                  1022 CBC    19399350 :         pgWalUsage.wal_records++;
 1069 akapila                  1023 GIC    19399350 :         pgWalUsage.wal_fpi += num_fpi;
 1100 akapila                  1024 ECB             :     }
                               1025                 : 
 3562 heikki.linnakangas       1026 CBC    19399402 :     return EndPos;
                               1027                 : }
                               1028                 : 
 3562 heikki.linnakangas       1029 ECB             : /*
                               1030                 :  * Reserves the right amount of space for a record of given size from the WAL.
                               1031                 :  * *StartPos is set to the beginning of the reserved section, *EndPos to
                               1032                 :  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
                               1033                 :  * used to set the xl_prev of this record.
                               1034                 :  *
                               1035                 :  * This is the performance critical part of XLogInsert that must be serialized
                               1036                 :  * across backends. The rest can happen mostly in parallel. Try to keep this
                               1037                 :  * section as short as possible, insertpos_lck can be heavily contended on a
                               1038                 :  * busy system.
                               1039                 :  *
                               1040                 :  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
                               1041                 :  * where we actually copy the record to the reserved space.
                               1042                 :  */
                               1043                 : static void
 3562 heikki.linnakangas       1044 GIC    19399102 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                               1045                 :                           XLogRecPtr *PrevPtr)
                               1046                 : {
 3121 andres                   1047 CBC    19399102 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
                               1048                 :     uint64      startbytepos;
                               1049                 :     uint64      endbytepos;
 3562 heikki.linnakangas       1050 ECB             :     uint64      prevbytepos;
                               1051                 : 
 3562 heikki.linnakangas       1052 GIC    19399102 :     size = MAXALIGN(size);
                               1053                 : 
                               1054                 :     /* All (non xlog-switch) records should contain data. */
 3562 heikki.linnakangas       1055 CBC    19399102 :     Assert(size > SizeOfXLogRecord);
                               1056                 : 
                               1057                 :     /*
 3562 heikki.linnakangas       1058 ECB             :      * The duration the spinlock needs to be held is minimized by minimizing
                               1059                 :      * the calculations that have to be done while holding the lock. The
                               1060                 :      * current tip of reserved WAL is kept in CurrBytePos, as a byte position
                               1061                 :      * that only counts "usable" bytes in WAL, that is, it excludes all WAL
                               1062                 :      * page headers. The mapping between "usable" byte positions and physical
                               1063                 :      * positions (XLogRecPtrs) can be done outside the locked region, and
                               1064                 :      * because the usable byte position doesn't include any headers, reserving
                               1065                 :      * X bytes from WAL is almost as simple as "CurrBytePos += X".
                               1066                 :      */
 3562 heikki.linnakangas       1067 GIC    19399102 :     SpinLockAcquire(&Insert->insertpos_lck);
                               1068                 : 
                               1069        19399102 :     startbytepos = Insert->CurrBytePos;
 3562 heikki.linnakangas       1070 CBC    19399102 :     endbytepos = startbytepos + size;
 3562 heikki.linnakangas       1071 GIC    19399102 :     prevbytepos = Insert->PrevBytePos;
 3562 heikki.linnakangas       1072 CBC    19399102 :     Insert->CurrBytePos = endbytepos;
                               1073        19399102 :     Insert->PrevBytePos = startbytepos;
 3562 heikki.linnakangas       1074 ECB             : 
 3562 heikki.linnakangas       1075 CBC    19399102 :     SpinLockRelease(&Insert->insertpos_lck);
 3562 heikki.linnakangas       1076 ECB             : 
 3562 heikki.linnakangas       1077 GIC    19399102 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
 3562 heikki.linnakangas       1078 CBC    19399102 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
 3562 heikki.linnakangas       1079 GIC    19399102 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
 3562 heikki.linnakangas       1080 ECB             : 
                               1081                 :     /*
                               1082                 :      * Check that the conversions between "usable byte positions" and
                               1083                 :      * XLogRecPtrs work consistently in both directions.
                               1084                 :      */
 3562 heikki.linnakangas       1085 GIC    19399102 :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
                               1086        19399102 :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
                               1087        19399102 :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
 3562 heikki.linnakangas       1088 CBC    19399102 : }
 3562 heikki.linnakangas       1089 ECB             : 
                               1090                 : /*
                               1091                 :  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
                               1092                 :  *
                               1093                 :  * A log-switch record is handled slightly differently. The rest of the
                               1094                 :  * segment will be reserved for this insertion, as indicated by the returned
                               1095                 :  * *EndPos value. However, if we are already at the beginning of the current
                               1096                 :  * segment, *StartPos and *EndPos are set to the current location without
                               1097                 :  * reserving any space, and the function returns false.
                               1098                 : */
                               1099                 : static bool
 3562 heikki.linnakangas       1100 GIC         300 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
                               1101                 : {
 3121 andres                   1102             300 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
 3562 heikki.linnakangas       1103 ECB             :     uint64      startbytepos;
                               1104                 :     uint64      endbytepos;
                               1105                 :     uint64      prevbytepos;
 3062 heikki.linnakangas       1106 GIC         300 :     uint32      size = MAXALIGN(SizeOfXLogRecord);
                               1107                 :     XLogRecPtr  ptr;
                               1108                 :     uint32      segleft;
 3562 heikki.linnakangas       1109 ECB             : 
                               1110                 :     /*
                               1111                 :      * These calculations are a bit heavy-weight to be done while holding a
                               1112                 :      * spinlock, but since we're holding all the WAL insertion locks, there
                               1113                 :      * are no other inserters competing for it. GetXLogInsertRecPtr() does
                               1114                 :      * compete for it, but that's not called very frequently.
                               1115                 :      */
 3562 heikki.linnakangas       1116 GIC         300 :     SpinLockAcquire(&Insert->insertpos_lck);
                               1117                 : 
                               1118             300 :     startbytepos = Insert->CurrBytePos;
 3562 heikki.linnakangas       1119 ECB             : 
 3562 heikki.linnakangas       1120 GIC         300 :     ptr = XLogBytePosToEndRecPtr(startbytepos);
 2028 andres                   1121 CBC         300 :     if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
                               1122                 :     {
 3562 heikki.linnakangas       1123              52 :         SpinLockRelease(&Insert->insertpos_lck);
                               1124              52 :         *EndPos = *StartPos = ptr;
 3562 heikki.linnakangas       1125 GIC          52 :         return false;
 3562 heikki.linnakangas       1126 ECB             :     }
                               1127                 : 
 3562 heikki.linnakangas       1128 CBC         248 :     endbytepos = startbytepos + size;
 3562 heikki.linnakangas       1129 GIC         248 :     prevbytepos = Insert->PrevBytePos;
                               1130                 : 
 3562 heikki.linnakangas       1131 CBC         248 :     *StartPos = XLogBytePosToRecPtr(startbytepos);
                               1132             248 :     *EndPos = XLogBytePosToEndRecPtr(endbytepos);
                               1133                 : 
 2028 andres                   1134             248 :     segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
                               1135             248 :     if (segleft != wal_segment_size)
                               1136                 :     {
 3562 heikki.linnakangas       1137 ECB             :         /* consume the rest of the segment */
 3562 heikki.linnakangas       1138 CBC         248 :         *EndPos += segleft;
 3562 heikki.linnakangas       1139 GIC         248 :         endbytepos = XLogRecPtrToBytePos(*EndPos);
                               1140                 :     }
 3562 heikki.linnakangas       1141 CBC         248 :     Insert->CurrBytePos = endbytepos;
                               1142             248 :     Insert->PrevBytePos = startbytepos;
                               1143                 : 
                               1144             248 :     SpinLockRelease(&Insert->insertpos_lck);
 3562 heikki.linnakangas       1145 ECB             : 
 3562 heikki.linnakangas       1146 GIC         248 :     *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
 3562 heikki.linnakangas       1147 ECB             : 
 2028 andres                   1148 GIC         248 :     Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
 3562 heikki.linnakangas       1149 CBC         248 :     Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
 3562 heikki.linnakangas       1150 GIC         248 :     Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
 3562 heikki.linnakangas       1151 CBC         248 :     Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
 3562 heikki.linnakangas       1152 ECB             : 
 3562 heikki.linnakangas       1153 CBC         248 :     return true;
 3562 heikki.linnakangas       1154 ECB             : }
                               1155                 : 
                               1156                 : /*
                               1157                 :  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
                               1158                 :  * area in the WAL.
                               1159                 :  */
                               1160                 : static void
 3562 heikki.linnakangas       1161 GIC    19399350 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
                               1162                 :                     XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
                               1163                 : {
 3562 heikki.linnakangas       1164 ECB             :     char       *currpos;
                               1165                 :     int         freespace;
                               1166                 :     int         written;
                               1167                 :     XLogRecPtr  CurrPos;
                               1168                 :     XLogPageHeader pagehdr;
                               1169                 : 
                               1170                 :     /*
                               1171                 :      * Get a pointer to the right place in the right WAL buffer to start
                               1172                 :      * inserting to.
                               1173                 :      */
 3562 heikki.linnakangas       1174 GIC    19399350 :     CurrPos = StartPos;
  520 rhaas                    1175        19399350 :     currpos = GetXLogBuffer(CurrPos, tli);
 3562 heikki.linnakangas       1176        19399350 :     freespace = INSERT_FREESPACE(CurrPos);
 3562 heikki.linnakangas       1177 ECB             : 
                               1178                 :     /*
                               1179                 :      * there should be enough space for at least the first field (xl_tot_len)
                               1180                 :      * on this page.
                               1181                 :      */
 3562 heikki.linnakangas       1182 GIC    19399350 :     Assert(freespace >= sizeof(uint32));
                               1183                 : 
                               1184                 :     /* Copy record data */
 3562 heikki.linnakangas       1185 CBC    19399350 :     written = 0;
 3562 heikki.linnakangas       1186 GIC    85337519 :     while (rdata != NULL)
                               1187                 :     {
 3562 heikki.linnakangas       1188 CBC    65938169 :         char       *rdata_data = rdata->data;
                               1189        65938169 :         int         rdata_len = rdata->len;
                               1190                 : 
                               1191        66429163 :         while (rdata_len > freespace)
 3562 heikki.linnakangas       1192 ECB             :         {
                               1193                 :             /*
                               1194                 :              * Write what fits on this page, and continue on the next page.
                               1195                 :              */
 3562 heikki.linnakangas       1196 GIC      490994 :             Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
                               1197          490994 :             memcpy(currpos, rdata_data, freespace);
                               1198          490994 :             rdata_data += freespace;
 3562 heikki.linnakangas       1199 CBC      490994 :             rdata_len -= freespace;
                               1200          490994 :             written += freespace;
                               1201          490994 :             CurrPos += freespace;
 3562 heikki.linnakangas       1202 ECB             : 
                               1203                 :             /*
                               1204                 :              * Get pointer to beginning of next page, and set the xlp_rem_len
                               1205                 :              * in the page header. Set XLP_FIRST_IS_CONTRECORD.
                               1206                 :              *
                               1207                 :              * It's safe to set the contrecord flag and xlp_rem_len without a
                               1208                 :              * lock on the page. All the other flags were already set when the
                               1209                 :              * page was initialized, in AdvanceXLInsertBuffer, and we're the
                               1210                 :              * only backend that needs to set the contrecord flag.
                               1211                 :              */
  520 rhaas                    1212 GIC      490994 :             currpos = GetXLogBuffer(CurrPos, tli);
 3562 heikki.linnakangas       1213          490994 :             pagehdr = (XLogPageHeader) currpos;
                               1214          490994 :             pagehdr->xlp_rem_len = write_len - written;
 3562 heikki.linnakangas       1215 CBC      490994 :             pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 3562 heikki.linnakangas       1216 ECB             : 
                               1217                 :             /* skip over the page header */
 2028 andres                   1218 CBC      490994 :             if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
                               1219                 :             {
 3562 heikki.linnakangas       1220 GIC         426 :                 CurrPos += SizeOfXLogLongPHD;
 3562 heikki.linnakangas       1221 CBC         426 :                 currpos += SizeOfXLogLongPHD;
                               1222                 :             }
 3562 heikki.linnakangas       1223 ECB             :             else
                               1224                 :             {
 3562 heikki.linnakangas       1225 GIC      490568 :                 CurrPos += SizeOfXLogShortPHD;
                               1226          490568 :                 currpos += SizeOfXLogShortPHD;
                               1227                 :             }
 3562 heikki.linnakangas       1228 CBC      490994 :             freespace = INSERT_FREESPACE(CurrPos);
 3562 heikki.linnakangas       1229 ECB             :         }
                               1230                 : 
 3562 heikki.linnakangas       1231 CBC    65938169 :         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
 3562 heikki.linnakangas       1232 GIC    65938169 :         memcpy(currpos, rdata_data, rdata_len);
                               1233        65938169 :         currpos += rdata_len;
 3562 heikki.linnakangas       1234 CBC    65938169 :         CurrPos += rdata_len;
                               1235        65938169 :         freespace -= rdata_len;
                               1236        65938169 :         written += rdata_len;
 3562 heikki.linnakangas       1237 ECB             : 
 3562 heikki.linnakangas       1238 CBC    65938169 :         rdata = rdata->next;
 3562 heikki.linnakangas       1239 ECB             :     }
 3562 heikki.linnakangas       1240 GIC    19399350 :     Assert(written == write_len);
 3562 heikki.linnakangas       1241 ECB             : 
                               1242                 :     /*
                               1243                 :      * If this was an xlog-switch, it's not enough to write the switch record,
                               1244                 :      * we also have to consume all the remaining space in the WAL segment.  We
                               1245                 :      * have already reserved that space, but we need to actually fill it.
                               1246                 :      */
 2028 andres                   1247 GIC    19399350 :     if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
                               1248                 :     {
                               1249                 :         /* An xlog-switch record doesn't contain any data besides the header */
 3562 heikki.linnakangas       1250 CBC         248 :         Assert(write_len == SizeOfXLogRecord);
                               1251                 : 
                               1252                 :         /* Assert that we did reserve the right amount of space */
 2028 andres                   1253             248 :         Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
                               1254                 : 
                               1255                 :         /* Use up all the remaining space on the current page */
 3562 heikki.linnakangas       1256             248 :         CurrPos += freespace;
                               1257                 : 
                               1258                 :         /*
 1836 tgl                      1259 ECB             :          * Cause all remaining pages in the segment to be flushed, leaving the
                               1260                 :          * XLog position where it should be, at the start of the next segment.
                               1261                 :          * We do this one page at a time, to make sure we don't deadlock
                               1262                 :          * against ourselves if wal_buffers < wal_segment_size.
                               1263                 :          */
 3562 heikki.linnakangas       1264 GIC      384232 :         while (CurrPos < EndPos)
                               1265                 :         {
                               1266                 :             /*
 1836 tgl                      1267 ECB             :              * The minimal action to flush the page would be to call
                               1268                 :              * WALInsertLockUpdateInsertingAt(CurrPos) followed by
                               1269                 :              * AdvanceXLInsertBuffer(...).  The page would be left initialized
                               1270                 :              * mostly to zeros, except for the page header (always the short
                               1271                 :              * variant, as this is never a segment's first page).
                               1272                 :              *
                               1273                 :              * The large vistas of zeros are good for compressibility, but the
                               1274                 :              * headers interrupting them every XLOG_BLCKSZ (with values that
                               1275                 :              * differ from page to page) are not.  The effect varies with
                               1276                 :              * compression tool, but bzip2 for instance compresses about an
                               1277                 :              * order of magnitude worse if those headers are left in place.
                               1278                 :              *
                               1279                 :              * Rather than complicating AdvanceXLInsertBuffer itself (which is
                               1280                 :              * called in heavily-loaded circumstances as well as this lightly-
                               1281                 :              * loaded one) with variant behavior, we just use GetXLogBuffer
                               1282                 :              * (which itself calls the two methods we need) to get the pointer
                               1283                 :              * and zero most of the page.  Then we just zero the page header.
                               1284                 :              */
  520 rhaas                    1285 GIC      383984 :             currpos = GetXLogBuffer(CurrPos, tli);
 1836 tgl                      1286         1535936 :             MemSet(currpos, 0, SizeOfXLogShortPHD);
                               1287                 : 
 3562 heikki.linnakangas       1288 CBC      383984 :             CurrPos += XLOG_BLCKSZ;
 3562 heikki.linnakangas       1289 ECB             :         }
                               1290                 :     }
 3062                          1291                 :     else
                               1292                 :     {
                               1293                 :         /* Align the end position, so that the next record starts aligned */
 3062 heikki.linnakangas       1294 GIC    19399102 :         CurrPos = MAXALIGN64(CurrPos);
                               1295                 :     }
                               1296                 : 
 3562 heikki.linnakangas       1297 CBC    19399350 :     if (CurrPos != EndPos)
 3562 heikki.linnakangas       1298 UIC           0 :         elog(PANIC, "space reserved for WAL record does not match what was written");
 3562 heikki.linnakangas       1299 GIC    19399350 : }
 3562 heikki.linnakangas       1300 ECB             : 
 3562 heikki.linnakangas       1301 EUB             : /*
 3306 heikki.linnakangas       1302 ECB             :  * Acquire a WAL insertion lock, for inserting to WAL.
                               1303                 :  */
                               1304                 : static void
 3306 heikki.linnakangas       1305 GIC    19403953 : WALInsertLockAcquire(void)
                               1306                 : {
                               1307                 :     bool        immed;
 3562 heikki.linnakangas       1308 ECB             : 
                               1309                 :     /*
                               1310                 :      * It doesn't matter which of the WAL insertion locks we acquire, so try
                               1311                 :      * the one we used last time.  If the system isn't particularly busy, it's
                               1312                 :      * a good bet that it's still available, and it's good to have some
                               1313                 :      * affinity to a particular lock so that you don't unnecessarily bounce
                               1314                 :      * cache lines between processes when there's no contention.
                               1315                 :      *
                               1316                 :      * If this is the first time through in this backend, pick a lock
                               1317                 :      * (semi-)randomly.  This allows the locks to be used evenly if you have a
                               1318                 :      * lot of very short connections.
                               1319                 :      */
                               1320                 :     static int  lockToTry = -1;
                               1321                 : 
 3306 heikki.linnakangas       1322 GIC    19403953 :     if (lockToTry == -1)
 3112                          1323            5924 :         lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
 3306                          1324        19403953 :     MyLockNo = lockToTry;
 3562 heikki.linnakangas       1325 ECB             : 
                               1326                 :     /*
 3306                          1327                 :      * The insertingAt value is initially set to 0, as we don't know our
                               1328                 :      * insert location yet.
                               1329                 :      */
 2809 andres                   1330 GIC    19403953 :     immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
 3306 heikki.linnakangas       1331        19403953 :     if (!immed)
                               1332                 :     {
 3562 heikki.linnakangas       1333 ECB             :         /*
 3306                          1334                 :          * If we couldn't get the lock immediately, try another lock next
                               1335                 :          * time.  On a system with more insertion locks than concurrent
                               1336                 :          * inserters, this causes all the inserters to eventually migrate to a
                               1337                 :          * lock that no-one else is using.  On a system with more inserters
                               1338                 :          * than locks, it still helps to distribute the inserters evenly
                               1339                 :          * across the locks.
                               1340                 :          */
 3112 heikki.linnakangas       1341 GIC         635 :         lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
                               1342                 :     }
 3562                          1343        19403953 : }
 3562 heikki.linnakangas       1344 ECB             : 
                               1345                 : /*
 3306                          1346                 :  * Acquire all WAL insertion locks, to prevent other backends from inserting
                               1347                 :  * to WAL.
                               1348                 :  */
                               1349                 : static void
 3306 heikki.linnakangas       1350 GIC        3502 : WALInsertLockAcquireExclusive(void)
                               1351                 : {
                               1352                 :     int         i;
 3562 heikki.linnakangas       1353 ECB             : 
                               1354                 :     /*
                               1355                 :      * When holding all the locks, all but the last lock's insertingAt
                               1356                 :      * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
                               1357                 :      * XLogRecPtr value, to make sure that no-one blocks waiting on those.
                               1358                 :      */
 3112 heikki.linnakangas       1359 GIC       28016 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
                               1360                 :     {
 2809 andres                   1361           24514 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
 2809 andres                   1362 CBC       24514 :         LWLockUpdateVar(&WALInsertLocks[i].l.lock,
 2809 andres                   1363 GIC       24514 :                         &WALInsertLocks[i].l.insertingAt,
 2809 andres                   1364 ECB             :                         PG_UINT64_MAX);
 3562 heikki.linnakangas       1365                 :     }
 2809 andres                   1366                 :     /* Variable value reset to 0 at release */
 2809 andres                   1367 GIC        3502 :     LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
                               1368                 : 
 3306 heikki.linnakangas       1369            3502 :     holdingAllLocks = true;
 3562 heikki.linnakangas       1370 CBC        3502 : }
                               1371                 : 
 3562 heikki.linnakangas       1372 ECB             : /*
 3306                          1373                 :  * Release our insertion lock (or locks, if we're holding them all).
                               1374                 :  *
                               1375                 :  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
                               1376                 :  * next time the lock is acquired.
                               1377                 :  */
                               1378                 : static void
 3306 heikki.linnakangas       1379 GIC    19407455 : WALInsertLockRelease(void)
                               1380                 : {
                               1381        19407455 :     if (holdingAllLocks)
 3562 heikki.linnakangas       1382 ECB             :     {
                               1383                 :         int         i;
 3306                          1384                 : 
 3112 heikki.linnakangas       1385 GIC       31518 :         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
 2809 andres                   1386           28016 :             LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
                               1387           28016 :                                   &WALInsertLocks[i].l.insertingAt,
 2809 andres                   1388 ECB             :                                   0);
 3562 heikki.linnakangas       1389                 : 
 3306 heikki.linnakangas       1390 CBC        3502 :         holdingAllLocks = false;
                               1391                 :     }
                               1392                 :     else
 3562 heikki.linnakangas       1393 ECB             :     {
 2809 andres                   1394 GIC    19403953 :         LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
                               1395        19403953 :                               &WALInsertLocks[MyLockNo].l.insertingAt,
                               1396                 :                               0);
 3562 heikki.linnakangas       1397 ECB             :     }
 3562 heikki.linnakangas       1398 CBC    19407455 : }
                               1399                 : 
                               1400                 : /*
 3306 heikki.linnakangas       1401 ECB             :  * Update our insertingAt value, to let others know that we've finished
                               1402                 :  * inserting up to that point.
                               1403                 :  */
                               1404                 : static void
 3306 heikki.linnakangas       1405 GIC      742721 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
                               1406                 : {
                               1407          742721 :     if (holdingAllLocks)
 3562 heikki.linnakangas       1408 ECB             :     {
                               1409                 :         /*
 3306                          1410                 :          * We use the last lock to mark our actual position, see comments in
                               1411                 :          * WALInsertLockAcquireExclusive.
                               1412                 :          */
 3112 heikki.linnakangas       1413 GIC      380439 :         LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
 2118 tgl                      1414          380439 :                         &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
                               1415                 :                         insertingAt);
 3562 heikki.linnakangas       1416 ECB             :     }
                               1417                 :     else
 3306 heikki.linnakangas       1418 GIC      362282 :         LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
                               1419          362282 :                         &WALInsertLocks[MyLockNo].l.insertingAt,
                               1420                 :                         insertingAt);
 3562 heikki.linnakangas       1421 CBC      742721 : }
 3562 heikki.linnakangas       1422 ECB             : 
                               1423                 : /*
                               1424                 :  * Wait for any WAL insertions < upto to finish.
                               1425                 :  *
                               1426                 :  * Returns the location of the oldest insertion that is still in-progress.
                               1427                 :  * Any WAL prior to that point has been fully copied into WAL buffers, and
                               1428                 :  * can be flushed out to disk. Because this waits for any insertions older
                               1429                 :  * than 'upto' to finish, the return value is always >= 'upto'.
                               1430                 :  *
                               1431                 :  * Note: When you are about to write out WAL, you must call this function
                               1432                 :  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
                               1433                 :  * need to wait for an insertion to finish (or at least advance to next
                               1434                 :  * uninitialized page), and the inserter might need to evict an old WAL buffer
                               1435                 :  * to make room for a new one, which in turn requires WALWriteLock.
                               1436                 :  */
                               1437                 : static XLogRecPtr
 3562 heikki.linnakangas       1438 GIC      737182 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
                               1439                 : {
                               1440                 :     uint64      bytepos;
 3562 heikki.linnakangas       1441 ECB             :     XLogRecPtr  reservedUpto;
                               1442                 :     XLogRecPtr  finishedUpto;
 3121 andres                   1443 GIC      737182 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
                               1444                 :     int         i;
                               1445                 : 
 3562 heikki.linnakangas       1446 CBC      737182 :     if (MyProc == NULL)
 3562 heikki.linnakangas       1447 UIC           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
                               1448                 : 
 3562 heikki.linnakangas       1449 ECB             :     /* Read the current insert position */
 3562 heikki.linnakangas       1450 GBC      737182 :     SpinLockAcquire(&Insert->insertpos_lck);
 3562 heikki.linnakangas       1451 GIC      737182 :     bytepos = Insert->CurrBytePos;
                               1452          737182 :     SpinLockRelease(&Insert->insertpos_lck);
 3562 heikki.linnakangas       1453 CBC      737182 :     reservedUpto = XLogBytePosToEndRecPtr(bytepos);
 3562 heikki.linnakangas       1454 ECB             : 
                               1455                 :     /*
                               1456                 :      * No-one should request to flush a piece of WAL that hasn't even been
                               1457                 :      * reserved yet. However, it can happen if there is a block with a bogus
                               1458                 :      * LSN on disk, for example. XLogFlush checks for that situation and
                               1459                 :      * complains, but only after the flush. Here we just assume that to mean
                               1460                 :      * that all WAL that has been reserved needs to be finished. In this
                               1461                 :      * corner-case, the return value can be smaller than 'upto' argument.
                               1462                 :      */
 3562 heikki.linnakangas       1463 GIC      737182 :     if (upto > reservedUpto)
                               1464                 :     {
  856 peter                    1465 UIC           0 :         ereport(LOG,
  856 peter                    1466 ECB             :                 (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
                               1467                 :                         LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
 3562 heikki.linnakangas       1468 UBC           0 :         upto = reservedUpto;
                               1469                 :     }
                               1470                 : 
 3562 heikki.linnakangas       1471 EUB             :     /*
                               1472                 :      * Loop through all the locks, sleeping on any in-progress insert older
                               1473                 :      * than 'upto'.
                               1474                 :      *
                               1475                 :      * finishedUpto is our return value, indicating the point upto which all
                               1476                 :      * the WAL insertions have been finished. Initialize it to the head of
                               1477                 :      * reserved WAL, and as we iterate through the insertion locks, back it
                               1478                 :      * out for any insertion that's still in progress.
                               1479                 :      */
 3562 heikki.linnakangas       1480 GIC      737182 :     finishedUpto = reservedUpto;
 3112                          1481         6634638 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
                               1482                 :     {
 3260 bruce                    1483 CBC     5897456 :         XLogRecPtr  insertingat = InvalidXLogRecPtr;
 3260 bruce                    1484 ECB             : 
                               1485                 :         do
 3562 heikki.linnakangas       1486                 :         {
                               1487                 :             /*
                               1488                 :              * See if this insertion is in progress.  LWLockWaitForVar will
                               1489                 :              * wait for the lock to be released, or for the 'value' to be set
                               1490                 :              * by a LWLockUpdateVar call.  When a lock is initially acquired,
                               1491                 :              * its value is 0 (InvalidXLogRecPtr), which means that we don't
                               1492                 :              * know where it's inserting yet.  We will have to wait for it. If
                               1493                 :              * it's a small insertion, the record will most likely fit on the
                               1494                 :              * same page and the inserter will release the lock without ever
                               1495                 :              * calling LWLockUpdateVar.  But if it has to sleep, it will
                               1496                 :              * advertise the insertion point with LWLockUpdateVar before
                               1497                 :              * sleeping.
                               1498                 :              */
 3306 heikki.linnakangas       1499 GIC     5899012 :             if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
                               1500         5899012 :                                  &WALInsertLocks[i].l.insertingAt,
                               1501                 :                                  insertingat, &insertingat))
 3306 heikki.linnakangas       1502 ECB             :             {
                               1503                 :                 /* the lock was free, so no insertion in progress */
 3306 heikki.linnakangas       1504 GIC     2958401 :                 insertingat = InvalidXLogRecPtr;
                               1505         2958401 :                 break;
                               1506                 :             }
 3562 heikki.linnakangas       1507 ECB             : 
                               1508                 :             /*
                               1509                 :              * This insertion is still in progress. Have to wait, unless the
                               1510                 :              * inserter has proceeded past 'upto'.
                               1511                 :              */
 3306 heikki.linnakangas       1512 GIC     2940611 :         } while (insertingat < upto);
                               1513                 : 
                               1514         5897456 :         if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
 3306 heikki.linnakangas       1515 CBC      435915 :             finishedUpto = insertingat;
                               1516                 :     }
 3562                          1517          737182 :     return finishedUpto;
 3562 heikki.linnakangas       1518 ECB             : }
                               1519                 : 
                               1520                 : /*
                               1521                 :  * Get a pointer to the right location in the WAL buffer containing the
                               1522                 :  * given XLogRecPtr.
                               1523                 :  *
                               1524                 :  * If the page is not initialized yet, it is initialized. That might require
                               1525                 :  * evicting an old dirty buffer from the buffer cache, which means I/O.
                               1526                 :  *
                               1527                 :  * The caller must ensure that the page containing the requested location
                               1528                 :  * isn't evicted yet, and won't be evicted. The way to ensure that is to
                               1529                 :  * hold onto a WAL insertion lock with the insertingAt position set to
                               1530                 :  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
                               1531                 :  * to evict an old page from the buffer. (This means that once you call
                               1532                 :  * GetXLogBuffer() with a given 'ptr', you must not access anything before
                               1533                 :  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
                               1534                 :  * later, because older buffers might be recycled already)
                               1535                 :  */
                               1536                 : static char *
  520 rhaas                    1537 GIC    20274329 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
                               1538                 : {
                               1539                 :     int         idx;
 3562 heikki.linnakangas       1540 ECB             :     XLogRecPtr  endptr;
                               1541                 :     static uint64 cachedPage = 0;
                               1542                 :     static char *cachedPos = NULL;
                               1543                 :     XLogRecPtr  expectedEndPtr;
                               1544                 : 
                               1545                 :     /*
                               1546                 :      * Fast path for the common case that we need to access again the same
                               1547                 :      * page as last time.
                               1548                 :      */
 3562 heikki.linnakangas       1549 GIC    20274329 :     if (ptr / XLOG_BLCKSZ == cachedPage)
                               1550                 :     {
                               1551        19285796 :         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
 3562 heikki.linnakangas       1552 CBC    19285796 :         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
 3562 heikki.linnakangas       1553 GIC    19285796 :         return cachedPos + ptr % XLOG_BLCKSZ;
 6997 tgl                      1554 ECB             :     }
                               1555                 : 
 3562 heikki.linnakangas       1556                 :     /*
                               1557                 :      * The XLog buffer cache is organized so that a page is always loaded to a
                               1558                 :      * particular buffer.  That way we can easily calculate the buffer a given
                               1559                 :      * page must be loaded into, from the XLogRecPtr alone.
                               1560                 :      */
 3562 heikki.linnakangas       1561 GIC      988533 :     idx = XLogRecPtrToBufIdx(ptr);
                               1562                 : 
                               1563                 :     /*
 3562 heikki.linnakangas       1564 ECB             :      * See what page is loaded in the buffer at the moment. It could be the
                               1565                 :      * page we're looking for, or something older. It can't be anything newer
                               1566                 :      * - that would imply the page we're looking for has already been written
                               1567                 :      * out to disk and evicted, and the caller is responsible for making sure
                               1568                 :      * that doesn't happen.
                               1569                 :      *
                               1570                 :      * However, we don't hold a lock while we read the value. If someone has
                               1571                 :      * just initialized the page, it's possible that we get a "torn read" of
                               1572                 :      * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
                               1573                 :      * that case we will see a bogus value. That's ok, we'll grab the mapping
                               1574                 :      * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
                               1575                 :      * the page we're looking for. But it means that when we do this unlocked
                               1576                 :      * read, we might see a value that appears to be ahead of the page we're
                               1577                 :      * looking for. Don't PANIC on that, until we've verified the value while
                               1578                 :      * holding the lock.
                               1579                 :      */
 3562 heikki.linnakangas       1580 GIC      988533 :     expectedEndPtr = ptr;
                               1581          988533 :     expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
                               1582                 : 
 3562 heikki.linnakangas       1583 CBC      988533 :     endptr = XLogCtl->xlblocks[idx];
                               1584          988533 :     if (expectedEndPtr != endptr)
                               1585                 :     {
 2807 heikki.linnakangas       1586 ECB             :         XLogRecPtr  initializedUpto;
                               1587                 : 
                               1588                 :         /*
                               1589                 :          * Before calling AdvanceXLInsertBuffer(), which can block, let others
                               1590                 :          * know how far we're finished with inserting the record.
                               1591                 :          *
                               1592                 :          * NB: If 'ptr' points to just after the page header, advertise a
                               1593                 :          * position at the beginning of the page rather than 'ptr' itself. If
                               1594                 :          * there are no other insertions running, someone might try to flush
                               1595                 :          * up to our advertised location. If we advertised a position after
                               1596                 :          * the page header, someone might try to flush the page header, even
                               1597                 :          * though page might actually not be initialized yet. As the first
                               1598                 :          * inserter on the page, we are effectively responsible for making
                               1599                 :          * sure that it's initialized, before we let insertingAt to move past
                               1600                 :          * the page header.
                               1601                 :          */
 2807 heikki.linnakangas       1602 GIC      742721 :         if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
 2028 andres                   1603           12172 :             XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
 2807 heikki.linnakangas       1604           12172 :             initializedUpto = ptr - SizeOfXLogShortPHD;
 2807 heikki.linnakangas       1605 CBC      730549 :         else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
 2028 andres                   1606             343 :                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
 2807 heikki.linnakangas       1607             173 :             initializedUpto = ptr - SizeOfXLogLongPHD;
 2807 heikki.linnakangas       1608 ECB             :         else
 2807 heikki.linnakangas       1609 CBC      730376 :             initializedUpto = ptr;
 2807 heikki.linnakangas       1610 ECB             : 
 2807 heikki.linnakangas       1611 GIC      742721 :         WALInsertLockUpdateInsertingAt(initializedUpto);
 6090 tgl                      1612 ECB             : 
  520 rhaas                    1613 GIC      742721 :         AdvanceXLInsertBuffer(ptr, tli, false);
 3562 heikki.linnakangas       1614 CBC      742721 :         endptr = XLogCtl->xlblocks[idx];
                               1615                 : 
                               1616          742721 :         if (expectedEndPtr != endptr)
 3562 heikki.linnakangas       1617 LBC           0 :             elog(PANIC, "could not find WAL buffer for %X/%X",
                               1618                 :                  LSN_FORMAT_ARGS(ptr));
 3562 heikki.linnakangas       1619 ECB             :     }
 3562 heikki.linnakangas       1620 EUB             :     else
                               1621                 :     {
                               1622                 :         /*
                               1623                 :          * Make sure the initialization of the page is visible to us, and
                               1624                 :          * won't arrive later to overwrite the WAL data we write on the page.
                               1625                 :          */
 3562 heikki.linnakangas       1626 GIC      245812 :         pg_memory_barrier();
                               1627                 :     }
                               1628                 : 
 3562 heikki.linnakangas       1629 ECB             :     /*
                               1630                 :      * Found the buffer holding this page. Return a pointer to the right
                               1631                 :      * offset within the page.
                               1632                 :      */
 3562 heikki.linnakangas       1633 GIC      988533 :     cachedPage = ptr / XLOG_BLCKSZ;
                               1634          988533 :     cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
                               1635                 : 
 3562 heikki.linnakangas       1636 CBC      988533 :     Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
                               1637          988533 :     Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
                               1638                 : 
                               1639          988533 :     return cachedPos + ptr % XLOG_BLCKSZ;
 3562 heikki.linnakangas       1640 ECB             : }
                               1641                 : 
                               1642                 : /*
                               1643                 :  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
                               1644                 :  * is the position starting from the beginning of WAL, excluding all WAL
                               1645                 :  * page headers.
                               1646                 :  */
                               1647                 : static XLogRecPtr
 3562 heikki.linnakangas       1648 GIC    38806008 : XLogBytePosToRecPtr(uint64 bytepos)
                               1649                 : {
                               1650                 :     uint64      fullsegs;
 3562 heikki.linnakangas       1651 ECB             :     uint64      fullpages;
                               1652                 :     uint64      bytesleft;
                               1653                 :     uint32      seg_offset;
                               1654                 :     XLogRecPtr  result;
                               1655                 : 
 3562 heikki.linnakangas       1656 GIC    38806008 :     fullsegs = bytepos / UsableBytesInSegment;
                               1657        38806008 :     bytesleft = bytepos % UsableBytesInSegment;
                               1658                 : 
 3562 heikki.linnakangas       1659 CBC    38806008 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
 8205 vadim4o                  1660 ECB             :     {
                               1661                 :         /* fits on first page of segment */
 3562 heikki.linnakangas       1662 CBC       73916 :         seg_offset = bytesleft + SizeOfXLogLongPHD;
                               1663                 :     }
                               1664                 :     else
 8595 vadim4o                  1665 ECB             :     {
                               1666                 :         /* account for the first page on segment with long header */
 3562 heikki.linnakangas       1667 GIC    38732092 :         seg_offset = XLOG_BLCKSZ;
                               1668        38732092 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
                               1669                 : 
 3562 heikki.linnakangas       1670 CBC    38732092 :         fullpages = bytesleft / UsableBytesInPage;
                               1671        38732092 :         bytesleft = bytesleft % UsableBytesInPage;
                               1672                 : 
                               1673        38732092 :         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
 8595 vadim4o                  1674 ECB             :     }
                               1675                 : 
 1735 alvherre                 1676 CBC    38806008 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
                               1677                 : 
 3562 heikki.linnakangas       1678 GIC    38806008 :     return result;
 3562 heikki.linnakangas       1679 ECB             : }
                               1680                 : 
                               1681                 : /*
                               1682                 :  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
                               1683                 :  * returns a pointer to the beginning of the page (ie. before page header),
                               1684                 :  * not to where the first xlog record on that page would go to. This is used
                               1685                 :  * when converting a pointer to the end of a record.
                               1686                 :  */
                               1687                 : static XLogRecPtr
 3562 heikki.linnakangas       1688 GIC    20136832 : XLogBytePosToEndRecPtr(uint64 bytepos)
                               1689                 : {
                               1690                 :     uint64      fullsegs;
 3562 heikki.linnakangas       1691 ECB             :     uint64      fullpages;
                               1692                 :     uint64      bytesleft;
                               1693                 :     uint32      seg_offset;
                               1694                 :     XLogRecPtr  result;
                               1695                 : 
 3562 heikki.linnakangas       1696 GIC    20136832 :     fullsegs = bytepos / UsableBytesInSegment;
                               1697        20136832 :     bytesleft = bytepos % UsableBytesInSegment;
                               1698                 : 
 3562 heikki.linnakangas       1699 CBC    20136832 :     if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
 3562 heikki.linnakangas       1700 ECB             :     {
                               1701                 :         /* fits on first page of segment */
 3562 heikki.linnakangas       1702 CBC      394401 :         if (bytesleft == 0)
 3562 heikki.linnakangas       1703 GIC      357373 :             seg_offset = 0;
                               1704                 :         else
 3562 heikki.linnakangas       1705 CBC       37028 :             seg_offset = bytesleft + SizeOfXLogLongPHD;
 3562 heikki.linnakangas       1706 ECB             :     }
                               1707                 :     else
 6090 tgl                      1708                 :     {
                               1709                 :         /* account for the first page on segment with long header */
 3562 heikki.linnakangas       1710 GIC    19742431 :         seg_offset = XLOG_BLCKSZ;
                               1711        19742431 :         bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
                               1712                 : 
 3562 heikki.linnakangas       1713 CBC    19742431 :         fullpages = bytesleft / UsableBytesInPage;
                               1714        19742431 :         bytesleft = bytesleft % UsableBytesInPage;
                               1715                 : 
                               1716        19742431 :         if (bytesleft == 0)
                               1717           20224 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
                               1718                 :         else
                               1719        19722207 :             seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
 3562 heikki.linnakangas       1720 ECB             :     }
                               1721                 : 
 1735 alvherre                 1722 CBC    20136832 :     XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
                               1723                 : 
 3562 heikki.linnakangas       1724 GIC    20136832 :     return result;
 3562 heikki.linnakangas       1725 ECB             : }
                               1726                 : 
                               1727                 : /*
                               1728                 :  * Convert an XLogRecPtr to a "usable byte position".
                               1729                 :  */
                               1730                 : static uint64
 3562 heikki.linnakangas       1731 GIC    58200582 : XLogRecPtrToBytePos(XLogRecPtr ptr)
                               1732                 : {
                               1733                 :     uint64      fullsegs;
 3562 heikki.linnakangas       1734 ECB             :     uint32      fullpages;
                               1735                 :     uint32      offset;
                               1736                 :     uint64      result;
                               1737                 : 
 2028 andres                   1738 GIC    58200582 :     XLByteToSeg(ptr, fullsegs, wal_segment_size);
                               1739                 : 
                               1740        58200582 :     fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
 3562 heikki.linnakangas       1741 CBC    58200582 :     offset = ptr % XLOG_BLCKSZ;
                               1742                 : 
                               1743        58200582 :     if (fullpages == 0)
 3562 heikki.linnakangas       1744 ECB             :     {
 3562 heikki.linnakangas       1745 GIC      111171 :         result = fullsegs * UsableBytesInSegment;
 3562 heikki.linnakangas       1746 CBC      111171 :         if (offset > 0)
                               1747                 :         {
                               1748          110641 :             Assert(offset >= SizeOfXLogLongPHD);
                               1749          110641 :             result += offset - SizeOfXLogLongPHD;
                               1750                 :         }
 6090 tgl                      1751 ECB             :     }
 8595 vadim4o                  1752                 :     else
                               1753                 :     {
 3562 heikki.linnakangas       1754 GIC    58089411 :         result = fullsegs * UsableBytesInSegment +
 3260 bruce                    1755        58089411 :             (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
 2118 tgl                      1756        58089411 :             (fullpages - 1) * UsableBytesInPage;    /* full pages */
 3562 heikki.linnakangas       1757 CBC    58089411 :         if (offset > 0)
 6090 tgl                      1758 ECB             :         {
 3562 heikki.linnakangas       1759 CBC    58069581 :             Assert(offset >= SizeOfXLogShortPHD);
                               1760        58069581 :             result += offset - SizeOfXLogShortPHD;
                               1761                 :         }
 8595 vadim4o                  1762 ECB             :     }
                               1763                 : 
 3562 heikki.linnakangas       1764 GIC    58200582 :     return result;
                               1765                 : }
                               1766                 : 
 8062 tgl                      1767 ECB             : /*
                               1768                 :  * Initialize XLOG buffers, writing out old buffers if they still contain
                               1769                 :  * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
                               1770                 :  * true, initialize as many pages as we can without having to write out
                               1771                 :  * unwritten data. Any new pages are initialized to zeros, with page headers
                               1772                 :  * initialized properly.
                               1773                 :  */
                               1774                 : static void
  520 rhaas                    1775 GIC      749930 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
                               1776                 : {
 8062 tgl                      1777          749930 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
 3562 heikki.linnakangas       1778 ECB             :     int         nextidx;
                               1779                 :     XLogRecPtr  OldPageRqstPtr;
 8062 tgl                      1780                 :     XLogwrtRqst WriteRqst;
 3562 heikki.linnakangas       1781 GIC      749930 :     XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
                               1782                 :     XLogRecPtr  NewPageBeginPtr;
                               1783                 :     XLogPageHeader NewPage;
  201 tgl                      1784 CBC      749930 :     int         npages pg_attribute_unused() = 0;
                               1785                 : 
 3562 heikki.linnakangas       1786 GIC      749930 :     LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
 8595 vadim4o                  1787 ECB             : 
                               1788                 :     /*
 3562 heikki.linnakangas       1789                 :      * Now that we have the lock, check if someone initialized the page
                               1790                 :      * already.
                               1791                 :      */
 3553 heikki.linnakangas       1792 GIC     2102859 :     while (upto >= XLogCtl->InitializedUpTo || opportunistic)
                               1793                 :     {
                               1794         1360138 :         nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
 7862 tgl                      1795 ECB             : 
                               1796                 :         /*
 3562 heikki.linnakangas       1797                 :          * Get ending-offset of the buffer page we need to replace (this may
                               1798                 :          * be zero if the buffer hasn't been used yet).  Fall through if it's
                               1799                 :          * already written out.
                               1800                 :          */
 3562 heikki.linnakangas       1801 GIC     1360138 :         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
 3754 alvherre                 1802         1360138 :         if (LogwrtResult.Write < OldPageRqstPtr)
                               1803                 :         {
 3562 heikki.linnakangas       1804 ECB             :             /*
                               1805                 :              * Nope, got work to do. If we just want to pre-initialize as much
                               1806                 :              * as we can without flushing, give up now.
                               1807                 :              */
 3562 heikki.linnakangas       1808 GIC      442792 :             if (opportunistic)
                               1809            7209 :                 break;
                               1810                 : 
 3562 heikki.linnakangas       1811 ECB             :             /* Before waiting, get info_lck and update LogwrtResult */
 3121 andres                   1812 CBC      435583 :             SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   1813 GIC      435583 :             if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
                               1814          357826 :                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
 3121 andres                   1815 CBC      435583 :             LogwrtResult = XLogCtl->LogwrtResult;
                               1816          435583 :             SpinLockRelease(&XLogCtl->info_lck);
 3562 heikki.linnakangas       1817 ECB             : 
                               1818                 :             /*
                               1819                 :              * Now that we have an up-to-date LogwrtResult value, see if we
                               1820                 :              * still need to write it or if someone else already did.
                               1821                 :              */
 3562 heikki.linnakangas       1822 GIC      435583 :             if (LogwrtResult.Write < OldPageRqstPtr)
                               1823                 :             {
                               1824                 :                 /*
 3562 heikki.linnakangas       1825 ECB             :                  * Must acquire write lock. Release WALBufMappingLock first,
                               1826                 :                  * to make sure that all insertions that we need to wait for
                               1827                 :                  * can finish (up to this same position). Otherwise we risk
                               1828                 :                  * deadlock.
                               1829                 :                  */
 3562 heikki.linnakangas       1830 GIC      433903 :                 LWLockRelease(WALBufMappingLock);
                               1831                 : 
                               1832          433903 :                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
 3562 heikki.linnakangas       1833 ECB             : 
 3562 heikki.linnakangas       1834 GIC      433903 :                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 3562 heikki.linnakangas       1835 ECB             : 
 3562 heikki.linnakangas       1836 GIC      433903 :                 LogwrtResult = XLogCtl->LogwrtResult;
 3562 heikki.linnakangas       1837 CBC      433903 :                 if (LogwrtResult.Write >= OldPageRqstPtr)
                               1838                 :                 {
 3562 heikki.linnakangas       1839 ECB             :                     /* OK, someone wrote it already */
 3562 heikki.linnakangas       1840 CBC        1477 :                     LWLockRelease(WALWriteLock);
                               1841                 :                 }
                               1842                 :                 else
 3562 heikki.linnakangas       1843 ECB             :                 {
                               1844                 :                     /* Have to write it ourselves */
                               1845                 :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
 3562 heikki.linnakangas       1846 GIC      432426 :                     WriteRqst.Write = OldPageRqstPtr;
                               1847          432426 :                     WriteRqst.Flush = 0;
  520 rhaas                    1848          432426 :                     XLogWrite(WriteRqst, tli, false);
 3562 heikki.linnakangas       1849 CBC      432426 :                     LWLockRelease(WALWriteLock);
  368 andres                   1850          432426 :                     PendingWalStats.wal_buffers_full++;
 3562 heikki.linnakangas       1851 ECB             :                     TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
                               1852                 :                 }
                               1853                 :                 /* Re-acquire WALBufMappingLock and retry */
 3562 heikki.linnakangas       1854 GIC      433903 :                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
                               1855          433903 :                 continue;
                               1856                 :             }
 8595 vadim4o                  1857 ECB             :         }
                               1858                 : 
                               1859                 :         /*
                               1860                 :          * Now the next buffer slot is free and we can set it up to be the
                               1861                 :          * next output page.
                               1862                 :          */
 3553 heikki.linnakangas       1863 GIC      919026 :         NewPageBeginPtr = XLogCtl->InitializedUpTo;
 3562                          1864          919026 :         NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
                               1865                 : 
 3562 heikki.linnakangas       1866 CBC      919026 :         Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
 6090 tgl                      1867 ECB             : 
 3562 heikki.linnakangas       1868 GIC      919026 :         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 6385 bruce                    1869 ECB             : 
                               1870                 :         /*
 3562 heikki.linnakangas       1871                 :          * Be sure to re-zero the buffer so that bytes beyond what we've
                               1872                 :          * written will look like zeroes and not valid XLOG records...
                               1873                 :          */
 3562 heikki.linnakangas       1874 GIC      919026 :         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
                               1875                 : 
                               1876                 :         /*
 3562 heikki.linnakangas       1877 ECB             :          * Fill the new page's header
                               1878                 :          */
 2878 bruce                    1879 GIC      919026 :         NewPage->xlp_magic = XLOG_PAGE_MAGIC;
                               1880                 : 
                               1881                 :         /* NewPage->xlp_info = 0; */ /* done by memset */
  520 rhaas                    1882 CBC      919026 :         NewPage->xlp_tli = tli;
 2878 bruce                    1883 GIC      919026 :         NewPage->xlp_pageaddr = NewPageBeginPtr;
                               1884                 : 
 3260 bruce                    1885 ECB             :         /* NewPage->xlp_rem_len = 0; */  /* done by memset */
 7934 tgl                      1886                 : 
                               1887                 :         /*
                               1888                 :          * If online backup is not in progress, mark the header to indicate
                               1889                 :          * that WAL records beginning in this page have removable backup
                               1890                 :          * blocks.  This allows the WAL archiver to know whether it is safe to
                               1891                 :          * compress archived WAL data by transforming full-block records into
                               1892                 :          * the non-full-block format.  It is sufficient to record this at the
                               1893                 :          * page level because we force a page switch (in fact a segment
                               1894                 :          * switch) when starting a backup, so the flag will be off before any
                               1895                 :          * records can be written during the backup.  At the end of a backup,
                               1896                 :          * the last page will be marked as all unsafe when perhaps only part
                               1897                 :          * is unsafe, but at worst the archiver would miss the opportunity to
                               1898                 :          * compress a few records.
                               1899                 :          */
  172 alvherre                 1900 GNC      919026 :         if (Insert->runningBackups == 0)
 2878 bruce                    1901 GIC      810359 :             NewPage->xlp_info |= XLP_BKP_REMOVABLE;
                               1902                 : 
 3562 heikki.linnakangas       1903 ECB             :         /*
                               1904                 :          * If first page of an XLOG segment file, make it a long header.
                               1905                 :          */
 2028 andres                   1906 GIC      919026 :         if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
                               1907                 :         {
 3562 heikki.linnakangas       1908             613 :             XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
 8062 tgl                      1909 ECB             : 
 3562 heikki.linnakangas       1910 GIC         613 :             NewLongPage->xlp_sysid = ControlFile->system_identifier;
 2028 andres                   1911 CBC         613 :             NewLongPage->xlp_seg_size = wal_segment_size;
 3562 heikki.linnakangas       1912 GIC         613 :             NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
 2878 bruce                    1913 CBC         613 :             NewPage->xlp_info |= XLP_LONG_HEADER;
 3562 heikki.linnakangas       1914 ECB             :         }
 4136 tgl                      1915                 : 
 3562 heikki.linnakangas       1916                 :         /*
                               1917                 :          * Make sure the initialization of the page becomes visible to others
                               1918                 :          * before the xlblocks update. GetXLogBuffer() reads xlblocks without
                               1919                 :          * holding a lock.
                               1920                 :          */
 3562 heikki.linnakangas       1921 GIC      919026 :         pg_write_barrier();
                               1922                 : 
                               1923          919026 :         *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
 6997 tgl                      1924 ECB             : 
 3553 heikki.linnakangas       1925 GIC      919026 :         XLogCtl->InitializedUpTo = NewPageEndPtr;
 6385 bruce                    1926 ECB             : 
 3562 heikki.linnakangas       1927 GIC      919026 :         npages++;
 6997 tgl                      1928 ECB             :     }
 3562 heikki.linnakangas       1929 GIC      749930 :     LWLockRelease(WALBufMappingLock);
 6997 tgl                      1930 ECB             : 
                               1931                 : #ifdef WAL_DEBUG
 2849 andres                   1932                 :     if (XLOG_DEBUG && npages > 0)
                               1933                 :     {
                               1934                 :         elog(DEBUG1, "initialized %d pages, up to %X/%X",
                               1935                 :              npages, LSN_FORMAT_ARGS(NewPageEndPtr));
                               1936                 :     }
                               1937                 : #endif
 8595 vadim4o                  1938 GIC      749930 : }
                               1939                 : 
                               1940                 : /*
 2196 simon                    1941 ECB             :  * Calculate CheckPointSegments based on max_wal_size_mb and
                               1942                 :  * checkpoint_completion_target.
                               1943                 :  */
                               1944                 : static void
 2967 heikki.linnakangas       1945 GIC        9091 : CalculateCheckpointSegments(void)
                               1946                 : {
                               1947                 :     double      target;
 2967 heikki.linnakangas       1948 ECB             : 
                               1949                 :     /*-------
                               1950                 :      * Calculate the distance at which to trigger a checkpoint, to avoid
                               1951                 :      * exceeding max_wal_size_mb. This is based on two assumptions:
                               1952                 :      *
                               1953                 :      * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
                               1954                 :      *    WAL for two checkpoint cycles to allow us to recover from the
                               1955                 :      *    secondary checkpoint if the first checkpoint failed, though we
                               1956                 :      *    only did this on the primary anyway, not on standby. Keeping just
                               1957                 :      *    one checkpoint simplifies processing and reduces disk space in
                               1958                 :      *    many smaller databases.)
                               1959                 :      * b) during checkpoint, we consume checkpoint_completion_target *
                               1960                 :      *    number of segments consumed between checkpoints.
                               1961                 :      *-------
                               1962                 :      */
 2028 andres                   1963 GIC        9091 :     target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
 1979 simon                    1964            9091 :         (1.0 + CheckPointCompletionTarget);
                               1965                 : 
 2967 heikki.linnakangas       1966 ECB             :     /* round down */
 2967 heikki.linnakangas       1967 CBC        9091 :     CheckPointSegments = (int) target;
                               1968                 : 
 2967 heikki.linnakangas       1969 GIC        9091 :     if (CheckPointSegments < 1)
 2967 heikki.linnakangas       1970 CBC           9 :         CheckPointSegments = 1;
 2967 heikki.linnakangas       1971 GIC        9091 : }
 2967 heikki.linnakangas       1972 ECB             : 
                               1973                 : void
 2967 heikki.linnakangas       1974 CBC        6012 : assign_max_wal_size(int newval, void *extra)
                               1975                 : {
 2196 simon                    1976 GIC        6012 :     max_wal_size_mb = newval;
 2967 heikki.linnakangas       1977 CBC        6012 :     CalculateCheckpointSegments();
 2967 heikki.linnakangas       1978 GIC        6012 : }
 2967 heikki.linnakangas       1979 ECB             : 
                               1980                 : void
 2967 heikki.linnakangas       1981 CBC        1857 : assign_checkpoint_completion_target(double newval, void *extra)
                               1982                 : {
 2967 heikki.linnakangas       1983 GIC        1857 :     CheckPointCompletionTarget = newval;
 2967 heikki.linnakangas       1984 CBC        1857 :     CalculateCheckpointSegments();
 2967 heikki.linnakangas       1985 GIC        1857 : }
 2967 heikki.linnakangas       1986 ECB             : 
                               1987                 : /*
                               1988                 :  * At a checkpoint, how many WAL segments to recycle as preallocated future
                               1989                 :  * XLOG segments? Returns the highest segment that should be preallocated.
                               1990                 :  */
                               1991                 : static XLogSegNo
 1208 michael                  1992 GIC        2363 : XLOGfileslop(XLogRecPtr lastredoptr)
                               1993                 : {
                               1994                 :     XLogSegNo   minSegNo;
 2967 heikki.linnakangas       1995 ECB             :     XLogSegNo   maxSegNo;
                               1996                 :     double      distance;
                               1997                 :     XLogSegNo   recycleSegNo;
                               1998                 : 
                               1999                 :     /*
                               2000                 :      * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
                               2001                 :      * correspond to. Always recycle enough segments to meet the minimum, and
                               2002                 :      * remove enough segments to stay below the maximum.
                               2003                 :      */
 1208 michael                  2004 GIC        2363 :     minSegNo = lastredoptr / wal_segment_size +
 2028 andres                   2005            2363 :         ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
 1208 michael                  2006            2363 :     maxSegNo = lastredoptr / wal_segment_size +
 2028 andres                   2007 CBC        2363 :         ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
 2967 heikki.linnakangas       2008 ECB             : 
                               2009                 :     /*
                               2010                 :      * Between those limits, recycle enough segments to get us through to the
                               2011                 :      * estimated end of next checkpoint.
                               2012                 :      *
                               2013                 :      * To estimate where the next checkpoint will finish, assume that the
                               2014                 :      * system runs steadily consuming CheckPointDistanceEstimate bytes between
                               2015                 :      * every checkpoint.
                               2016                 :      */
 1979 simon                    2017 GIC        2363 :     distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
                               2018                 :     /* add 10% for good measure. */
 2967 heikki.linnakangas       2019            2363 :     distance *= 1.10;
 2967 heikki.linnakangas       2020 ECB             : 
 1208 michael                  2021 GIC        2363 :     recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
 2028 andres                   2022 ECB             :                                     wal_segment_size);
                               2023                 : 
 2967 heikki.linnakangas       2024 CBC        2363 :     if (recycleSegNo < minSegNo)
 2967 heikki.linnakangas       2025 GIC        2219 :         recycleSegNo = minSegNo;
                               2026            2363 :     if (recycleSegNo > maxSegNo)
 2967 heikki.linnakangas       2027 CBC          55 :         recycleSegNo = maxSegNo;
 2967 heikki.linnakangas       2028 ECB             : 
 2967 heikki.linnakangas       2029 CBC        2363 :     return recycleSegNo;
 2967 heikki.linnakangas       2030 ECB             : }
                               2031                 : 
 5658 tgl                      2032                 : /*
                               2033                 :  * Check whether we've consumed enough xlog space that a checkpoint is needed.
                               2034                 :  *
                               2035                 :  * new_segno indicates a log file that has just been filled up (or read
                               2036                 :  * during recovery). We measure the distance from RedoRecPtr to new_segno
                               2037                 :  * and see if that exceeds CheckPointSegments.
                               2038                 :  *
                               2039                 :  * Note: it is the caller's responsibility to ensure RedoRecPtr is up-to-date.
                               2040                 :  */
                               2041                 : bool
 3941 heikki.linnakangas       2042 GIC         865 : XLogCheckpointNeeded(XLogSegNo new_segno)
                               2043                 : {
                               2044                 :     XLogSegNo   old_segno;
 3941 heikki.linnakangas       2045 ECB             : 
 2028 andres                   2046 GIC         865 :     XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
                               2047                 : 
 3941 heikki.linnakangas       2048             865 :     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
 5658 tgl                      2049 CBC         147 :         return true;
 5658 tgl                      2050 GIC         718 :     return false;
 5658 tgl                      2051 ECB             : }
                               2052                 : 
 8062                          2053                 : /*
                               2054                 :  * Write and/or fsync the log at least as far as WriteRqst indicates.
                               2055                 :  *
                               2056                 :  * If flexible == true, we don't have to write as far as WriteRqst, but
                               2057                 :  * may stop at any convenient boundary (such as a cache or logfile boundary).
                               2058                 :  * This option allows us to avoid uselessly issuing multiple writes when a
                               2059                 :  * single one would do.
                               2060                 :  *
                               2061                 :  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
                               2062                 :  * must be called before grabbing the lock, to make sure the data is ready to
                               2063                 :  * write.
                               2064                 :  */
                               2065                 : static void
  520 rhaas                    2066 GIC      731996 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
                               2067                 : {
                               2068                 :     bool        ispartialpage;
 6090 tgl                      2069 ECB             :     bool        last_iteration;
                               2070                 :     bool        finishing_seg;
                               2071                 :     int         curridx;
                               2072                 :     int         npages;
                               2073                 :     int         startidx;
                               2074                 :     uint32      startoffset;
                               2075                 : 
                               2076                 :     /* We should always be inside a critical section here */
 6568 tgl                      2077 GIC      731996 :     Assert(CritSectionCount > 0);
                               2078                 : 
                               2079                 :     /*
 6385 bruce                    2080 ECB             :      * Update local LogwrtResult (caller probably did this already, but...)
                               2081                 :      */
 4051 heikki.linnakangas       2082 GIC      731996 :     LogwrtResult = XLogCtl->LogwrtResult;
                               2083                 : 
                               2084                 :     /*
 6439 tgl                      2085 ECB             :      * Since successive pages in the xlog cache are consecutively allocated,
                               2086                 :      * we can usually gather multiple pages together and issue just one
                               2087                 :      * write() call.  npages is the number of pages we have determined can be
                               2088                 :      * written together; startidx is the cache block index of the first one,
                               2089                 :      * and startoffset is the file offset at which it should go. The latter
                               2090                 :      * two variables are only valid when npages > 0, but we must initialize
                               2091                 :      * all of them to keep the compiler quiet.
                               2092                 :      */
 6439 tgl                      2093 GIC      731996 :     npages = 0;
                               2094          731996 :     startidx = 0;
                               2095          731996 :     startoffset = 0;
 6439 tgl                      2096 ECB             : 
                               2097                 :     /*
                               2098                 :      * Within the loop, curridx is the cache block index of the page to
                               2099                 :      * consider writing.  Begin at the buffer containing the next unwritten
                               2100                 :      * page, or last partially written page.
                               2101                 :      */
 3553 heikki.linnakangas       2102 GIC      731996 :     curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
                               2103                 : 
 3754 alvherre                 2104         1617944 :     while (LogwrtResult.Write < WriteRqst.Write)
 8595 vadim4o                  2105 ECB             :     {
                               2106                 :         /*
 6385 bruce                    2107                 :          * Make sure we're not ahead of the insert process.  This could happen
                               2108                 :          * if we're passed a bogus WriteRqst.Write that is past the end of the
                               2109                 :          * last page that's been initialized by AdvanceXLInsertBuffer.
                               2110                 :          */
 3260 bruce                    2111 GIC     1184781 :         XLogRecPtr  EndPtr = XLogCtl->xlblocks[curridx];
                               2112                 : 
 3562 heikki.linnakangas       2113         1184781 :         if (LogwrtResult.Write >= EndPtr)
 7202 tgl                      2114 LBC           0 :             elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
                               2115                 :                  LSN_FORMAT_ARGS(LogwrtResult.Write),
  775 peter                    2116 ECB             :                  LSN_FORMAT_ARGS(EndPtr));
 8059 tgl                      2117 EUB             : 
                               2118                 :         /* Advance LogwrtResult.Write to end of current buffer page */
 3562 heikki.linnakangas       2119 GIC     1184781 :         LogwrtResult.Write = EndPtr;
 3754 alvherre                 2120         1184781 :         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
                               2121                 : 
 2028 andres                   2122 CBC     1184781 :         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
 2028 andres                   2123 ECB             :                              wal_segment_size))
                               2124                 :         {
 8062 tgl                      2125                 :             /*
                               2126                 :              * Switch to new logfile segment.  We cannot have any pending
                               2127                 :              * pages here (since we dump what we have at segment end).
                               2128                 :              */
 6439 tgl                      2129 GIC        7172 :             Assert(npages == 0);
 8062                          2130            7172 :             if (openLogFile >= 0)
 6142 bruce                    2131            1626 :                 XLogFileClose();
 2028 andres                   2132 CBC        7172 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 2028 andres                   2133 ECB             :                             wal_segment_size);
  520 rhaas                    2134 CBC        7172 :             openLogTLI = tli;
 8062 tgl                      2135 ECB             : 
                               2136                 :             /* create/use new log file */
  520 rhaas                    2137 CBC        7172 :             openLogFile = XLogFileInit(openLogSegNo, tli);
 1140 tgl                      2138 GIC        7172 :             ReserveExternalFD();
                               2139                 :         }
 8595 vadim4o                  2140 ECB             : 
 6439 tgl                      2141                 :         /* Make sure we have the current logfile open */
 8062 tgl                      2142 GIC     1184781 :         if (openLogFile < 0)
                               2143                 :         {
 2028 andres                   2144 UIC           0 :             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 2028 andres                   2145 ECB             :                             wal_segment_size);
  520 rhaas                    2146 UIC           0 :             openLogTLI = tli;
  520 rhaas                    2147 UBC           0 :             openLogFile = XLogFileOpen(openLogSegNo, tli);
 1140 tgl                      2148 UIC           0 :             ReserveExternalFD();
 8595 vadim4o                  2149 EUB             :         }
                               2150                 : 
 6439 tgl                      2151                 :         /* Add current page to the set of pending pages-to-dump */
 6439 tgl                      2152 GIC     1184781 :         if (npages == 0)
                               2153                 :         {
                               2154                 :             /* first of group */
 6439 tgl                      2155 CBC      736563 :             startidx = curridx;
 2028 andres                   2156 GIC      736563 :             startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
                               2157                 :                                             wal_segment_size);
 6439 tgl                      2158 ECB             :         }
 6439 tgl                      2159 CBC     1184781 :         npages++;
                               2160                 : 
                               2161                 :         /*
 6385 bruce                    2162 ECB             :          * Dump the set if this will be the last loop iteration, or if we are
                               2163                 :          * at the last page of the cache area (since the next page won't be
                               2164                 :          * contiguous in memory), or if we are at the end of the logfile
                               2165                 :          * segment.
                               2166                 :          */
 3754 alvherre                 2167 GIC     1184781 :         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
                               2168                 : 
 6439 tgl                      2169         2075101 :         finishing_seg = !ispartialpage &&
 2028 andres                   2170 CBC      890320 :             (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
                               2171                 : 
 6090 tgl                      2172         1184781 :         if (last_iteration ||
 6439                          2173          453199 :             curridx == XLogCtl->XLogCacheBlck ||
                               2174                 :             finishing_seg)
 8062 tgl                      2175 ECB             :         {
 6439                          2176                 :             char       *from;
                               2177                 :             Size        nbytes;
                               2178                 :             Size        nleft;
                               2179                 :             int         written;
                               2180                 :             instr_time  start;
                               2181                 : 
                               2182                 :             /* OK to write the page(s) */
 6215 tgl                      2183 GIC      736563 :             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
                               2184          736563 :             nbytes = npages * (Size) XLOG_BLCKSZ;
 3569 heikki.linnakangas       2185          736563 :             nleft = nbytes;
 3569 heikki.linnakangas       2186 ECB             :             do
 6439 tgl                      2187                 :             {
 3569 heikki.linnakangas       2188 CBC      736563 :                 errno = 0;
                               2189                 : 
                               2190                 :                 /* Measure I/O timing to write WAL data */
  761 fujii                    2191          736563 :                 if (track_wal_io_timing)
  761 fujii                    2192 UIC           0 :                     INSTR_TIME_SET_CURRENT(start);
                               2193                 :                 else
   79 andres                   2194 GNC      736563 :                     INSTR_TIME_SET_ZERO(start);
                               2195                 : 
 2213 rhaas                    2196 CBC      736563 :                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
  192 tmunro                   2197 GBC      736563 :                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
 2213 rhaas                    2198 GIC      736563 :                 pgstat_report_wait_end();
  761 fujii                    2199 ECB             : 
                               2200                 :                 /*
                               2201                 :                  * Increment the I/O timing and the number of times WAL data
                               2202                 :                  * were written out to disk.
                               2203                 :                  */
  761 fujii                    2204 GIC      736563 :                 if (track_wal_io_timing)
                               2205                 :                 {
                               2206                 :                     instr_time  duration;
                               2207                 : 
  761 fujii                    2208 UIC           0 :                     INSTR_TIME_SET_CURRENT(duration);
   10 andres                   2209 UNC           0 :                     INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, duration, start);
                               2210                 :                 }
                               2211                 : 
  368 andres                   2212 GBC      736563 :                 PendingWalStats.wal_write++;
  761 fujii                    2213 EUB             : 
 3569 heikki.linnakangas       2214 GIC      736563 :                 if (written <= 0)
                               2215                 :                 {
 1223 michael                  2216 ECB             :                     char        xlogfname[MAXFNAMELEN];
                               2217                 :                     int         save_errno;
                               2218                 : 
 3569 heikki.linnakangas       2219 UIC           0 :                     if (errno == EINTR)
                               2220               0 :                         continue;
                               2221                 : 
 1223 michael                  2222               0 :                     save_errno = errno;
  520 rhaas                    2223 UBC           0 :                     XLogFileName(xlogfname, tli, openLogSegNo,
 1223 michael                  2224 EUB             :                                  wal_segment_size);
 1223 michael                  2225 UIC           0 :                     errno = save_errno;
 3569 heikki.linnakangas       2226 UBC           0 :                     ereport(PANIC,
 3569 heikki.linnakangas       2227 EUB             :                             (errcode_for_file_access(),
                               2228                 :                              errmsg("could not write to log file %s "
 3363 tgl                      2229                 :                                     "at offset %u, length %zu: %m",
 1223 michael                  2230                 :                                     xlogfname, startoffset, nleft)));
                               2231                 :                 }
 3569 heikki.linnakangas       2232 GIC      736563 :                 nleft -= written;
                               2233          736563 :                 from += written;
 1614 tmunro                   2234          736563 :                 startoffset += written;
 3569 heikki.linnakangas       2235          736563 :             } while (nleft > 0);
 6439 tgl                      2236 ECB             : 
 6439 tgl                      2237 CBC      736563 :             npages = 0;
 6439 tgl                      2238 ECB             : 
                               2239                 :             /*
                               2240                 :              * If we just wrote the whole last page of a logfile segment,
                               2241                 :              * fsync the segment immediately.  This avoids having to go back
                               2242                 :              * and re-open prior segments when an fsync request comes along
                               2243                 :              * later. Doing it here ensures that one and only one backend will
                               2244                 :              * perform this fsync.
                               2245                 :              *
                               2246                 :              * This is also the right place to notify the Archiver that the
                               2247                 :              * segment is ready to copy to archival storage, and to update the
                               2248                 :              * timer for archive_timeout, and to signal for a checkpoint if
                               2249                 :              * too many logfile segments have been used since the last
                               2250                 :              * checkpoint.
                               2251                 :              */
 3562 heikki.linnakangas       2252 GIC      736563 :             if (finishing_seg)
                               2253                 :             {
  520 rhaas                    2254             692 :                 issue_xlog_fsync(openLogFile, openLogSegNo, tli);
                               2255                 : 
 3933 rhaas                    2256 ECB             :                 /* signal that we need to wakeup walsenders later */
 3933 rhaas                    2257 GIC         692 :                 WalSndWakeupRequest();
 3933 rhaas                    2258 ECB             : 
 2118 tgl                      2259 GIC         692 :                 LogwrtResult.Flush = LogwrtResult.Write;    /* end of page */
                               2260                 : 
 6439 tgl                      2261 CBC         692 :                 if (XLogArchivingActive())
  520 rhaas                    2262 GIC          33 :                     XLogArchiveNotifySeg(openLogSegNo, tli);
 6079 tgl                      2263 ECB             : 
 3553 heikki.linnakangas       2264 GIC         692 :                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
 2299 andres                   2265 CBC         692 :                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
 5966 tgl                      2266 ECB             : 
                               2267                 :                 /*
 3955 bruce                    2268                 :                  * Request a checkpoint if we've consumed too much xlog since
                               2269                 :                  * the last one.  For speed, we first check using the local
                               2270                 :                  * copy of RedoRecPtr, which might be out of date; if it looks
                               2271                 :                  * like a checkpoint is needed, forcibly update RedoRecPtr and
                               2272                 :                  * recheck.
                               2273                 :                  */
 3941 heikki.linnakangas       2274 GIC         692 :                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
                               2275                 :                 {
 5658 tgl                      2276              40 :                     (void) GetRedoRecPtr();
 3941 heikki.linnakangas       2277              40 :                     if (XLogCheckpointNeeded(openLogSegNo))
 5762 tgl                      2278 CBC          30 :                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
                               2279                 :                 }
 6439 tgl                      2280 ECB             :             }
 8062                          2281                 :         }
 8595 vadim4o                  2282                 : 
 8062 tgl                      2283 GIC     1184781 :         if (ispartialpage)
                               2284                 :         {
                               2285                 :             /* Only asked to write a partial page */
                               2286          294461 :             LogwrtResult.Write = WriteRqst.Write;
 8062 tgl                      2287 CBC      294461 :             break;
                               2288                 :         }
 6439 tgl                      2289 GIC      890320 :         curridx = NextBufIdx(curridx);
 6439 tgl                      2290 ECB             : 
                               2291                 :         /* If flexible, break out of loop as soon as we wrote something */
 6439 tgl                      2292 GIC      890320 :         if (flexible && npages == 0)
 6439 tgl                      2293 CBC        4372 :             break;
                               2294                 :     }
                               2295                 : 
                               2296          731996 :     Assert(npages == 0);
 8595 vadim4o                  2297 ECB             : 
                               2298                 :     /*
                               2299                 :      * If asked to flush, do so
 8062 tgl                      2300                 :      */
 3754 alvherre                 2301 GIC      731996 :     if (LogwrtResult.Flush < WriteRqst.Flush &&
                               2302          296291 :         LogwrtResult.Flush < LogwrtResult.Write)
                               2303                 :     {
                               2304                 :         /*
 6385 bruce                    2305 ECB             :          * Could get here without iterating above loop, in which case we might
 3260                          2306                 :          * have no open file or the wrong one.  However, we do not need to
                               2307                 :          * fsync more than one file.
                               2308                 :          */
 5441 tgl                      2309 GIC      296260 :         if (sync_method != SYNC_METHOD_OPEN &&
                               2310          296260 :             sync_method != SYNC_METHOD_OPEN_DSYNC)
                               2311                 :         {
 8059                          2312          296260 :             if (openLogFile >= 0 &&
 2028 andres                   2313 CBC      296253 :                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
 2028 andres                   2314 ECB             :                                  wal_segment_size))
 6142 bruce                    2315 UIC           0 :                 XLogFileClose();
 8059 tgl                      2316 CBC      296260 :             if (openLogFile < 0)
 8059 tgl                      2317 ECB             :             {
 2028 andres                   2318 GIC           7 :                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 2028 andres                   2319 EUB             :                                 wal_segment_size);
  520 rhaas                    2320 CBC           7 :                 openLogTLI = tli;
  520 rhaas                    2321 GIC           7 :                 openLogFile = XLogFileOpen(openLogSegNo, tli);
 1140 tgl                      2322 CBC           7 :                 ReserveExternalFD();
                               2323                 :             }
 3933 rhaas                    2324 ECB             : 
  520 rhaas                    2325 CBC      296260 :             issue_xlog_fsync(openLogFile, openLogSegNo, tli);
 8062 tgl                      2326 ECB             :         }
                               2327                 : 
                               2328                 :         /* signal that we need to wake up walsenders later */
 3933 rhaas                    2329 CBC      296260 :         WalSndWakeupRequest();
                               2330                 : 
 8062 tgl                      2331 GIC      296260 :         LogwrtResult.Flush = LogwrtResult.Write;
                               2332                 :     }
 8595 vadim4o                  2333 ECB             : 
                               2334                 :     /*
 8062 tgl                      2335                 :      * Update shared-memory status
                               2336                 :      *
                               2337                 :      * We make sure that the shared 'request' values do not fall behind the
                               2338                 :      * 'result' values.  This is not absolutely essential, but it saves some
                               2339                 :      * code in a couple of places.
                               2340                 :      */
                               2341                 :     {
 3121 andres                   2342 GIC      731996 :         SpinLockAcquire(&XLogCtl->info_lck);
                               2343          731996 :         XLogCtl->LogwrtResult = LogwrtResult;
                               2344          731996 :         if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
                               2345          285674 :             XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
 3121 andres                   2346 CBC      731996 :         if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
                               2347          296653 :             XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
                               2348          731996 :         SpinLockRelease(&XLogCtl->info_lck);
 7772 tgl                      2349 ECB             :     }
 8062 tgl                      2350 CBC      731996 : }
 8062 tgl                      2351 ECB             : 
 5730                          2352                 : /*
                               2353                 :  * Record the LSN for an asynchronous transaction commit/abort
 3988                          2354                 :  * and nudge the WALWriter if there is work for it to do.
                               2355                 :  * (This should not be called for synchronous commits.)
                               2356                 :  */
                               2357                 : void
 4637 simon                    2358 GIC       60542 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
                               2359                 : {
 4165                          2360           60542 :     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
                               2361                 :     bool        sleeping;
 4165 simon                    2362 ECB             : 
 3121 andres                   2363 GIC       60542 :     SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   2364 CBC       60542 :     LogwrtResult = XLogCtl->LogwrtResult;
 3121 andres                   2365 GIC       60542 :     sleeping = XLogCtl->WalWriterSleeping;
                               2366           60542 :     if (XLogCtl->asyncXactLSN < asyncXactLSN)
 3121 andres                   2367 CBC       60214 :         XLogCtl->asyncXactLSN = asyncXactLSN;
                               2368           60542 :     SpinLockRelease(&XLogCtl->info_lck);
 4165 simon                    2369 ECB             : 
 3988 tgl                      2370                 :     /*
                               2371                 :      * If the WALWriter is sleeping, we should kick it to make it come out of
 3260 bruce                    2372                 :      * low-power mode.  Otherwise, determine whether there's a full page of
                               2373                 :      * WAL available to write.
                               2374                 :      */
 3988 tgl                      2375 GIC       60542 :     if (!sleeping)
                               2376                 :     {
                               2377                 :         /* back off to last completed page boundary */
 3941 heikki.linnakangas       2378           60508 :         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
 4165 simon                    2379 ECB             : 
                               2380                 :         /* if we have already flushed that far, we're done */
 3754 alvherre                 2381 GIC       60508 :         if (WriteRqstPtr <= LogwrtResult.Flush)
 3988 tgl                      2382 CBC       17598 :             return;
                               2383                 :     }
                               2384                 : 
 4165 simon                    2385 ECB             :     /*
 3955 bruce                    2386                 :      * Nudge the WALWriter: it has a full page of WAL to write, or we want it
                               2387                 :      * to come out of low-power mode so that this async commit will reach disk
                               2388                 :      * within the expected amount of time.
                               2389                 :      */
 3988 tgl                      2390 GIC       42944 :     if (ProcGlobal->walwriterLatch)
                               2391            8097 :         SetLatch(ProcGlobal->walwriterLatch);
                               2392                 : }
                               2393                 : 
 3355 rhaas                    2394 ECB             : /*
                               2395                 :  * Record the LSN up to which we can remove WAL because it's not required by
                               2396                 :  * any replication slot.
                               2397                 :  */
                               2398                 : void
 3355 rhaas                    2399 GIC       19623 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
                               2400                 : {
 3121 andres                   2401           19623 :     SpinLockAcquire(&XLogCtl->info_lck);
                               2402           19623 :     XLogCtl->replicationSlotMinLSN = lsn;
 3121 andres                   2403 CBC       19623 :     SpinLockRelease(&XLogCtl->info_lck);
 3355 rhaas                    2404 GIC       19623 : }
 3355 rhaas                    2405 ECB             : 
                               2406                 : 
                               2407                 : /*
                               2408                 :  * Return the oldest LSN we must retain to satisfy the needs of some
                               2409                 :  * replication slot.
                               2410                 :  */
                               2411                 : static XLogRecPtr
 3355 rhaas                    2412 GIC        2660 : XLogGetReplicationSlotMinimumLSN(void)
                               2413                 : {
                               2414                 :     XLogRecPtr  retval;
                               2415                 : 
 3121 andres                   2416 CBC        2660 :     SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   2417 GIC        2660 :     retval = XLogCtl->replicationSlotMinLSN;
                               2418            2660 :     SpinLockRelease(&XLogCtl->info_lck);
                               2419                 : 
 3355 rhaas                    2420 CBC        2660 :     return retval;
 3355 rhaas                    2421 ECB             : }
                               2422                 : 
                               2423                 : /*
 5163 heikki.linnakangas       2424                 :  * Advance minRecoveryPoint in control file.
                               2425                 :  *
                               2426                 :  * If we crash during recovery, we must reach this point again before the
                               2427                 :  * database is consistent.
                               2428                 :  *
                               2429                 :  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
                               2430                 :  * is only updated if it's not already greater than or equal to 'lsn'.
                               2431                 :  */
                               2432                 : static void
 5163 heikki.linnakangas       2433 GIC       87149 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
                               2434                 : {
                               2435                 :     /* Quick check using our local copy of the variable */
  417                          2436           87149 :     if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
 5163 heikki.linnakangas       2437 CBC       80958 :         return;
                               2438                 : 
                               2439                 :     /*
 1739 michael                  2440 ECB             :      * An invalid minRecoveryPoint means that we need to recover all the WAL,
                               2441                 :      * i.e., we're doing crash recovery.  We never modify the control file's
                               2442                 :      * value in that case, so we can short-circuit future checks here too. The
                               2443                 :      * local values of minRecoveryPoint and minRecoveryPointTLI should not be
                               2444                 :      * updated until crash recovery finishes.  We only do this for the startup
                               2445                 :      * process as it should not update its own reference of minRecoveryPoint
                               2446                 :      * until it has finished crash recovery to make sure that all WAL
                               2447                 :      * available is replayed in this case.  This also spares the startup
                               2448                 :      * process from taking extra locks on the control file.
                               2449                 :      */
  417 heikki.linnakangas       2450 GIC        6191 :     if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
                               2451                 :     {
 1739 michael                  2452              25 :         updateMinRecoveryPoint = false;
                               2453              25 :         return;
 1739 michael                  2454 ECB             :     }
                               2455                 : 
 5163 heikki.linnakangas       2456 CBC        6166 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 5163 heikki.linnakangas       2457 ECB             : 
                               2458                 :     /* update local copy */
  417 heikki.linnakangas       2459 GIC        6166 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
  417 heikki.linnakangas       2460 CBC        6166 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
                               2461                 : 
  417 heikki.linnakangas       2462 GIC        6166 :     if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
 1682 michael                  2463 CBC           1 :         updateMinRecoveryPoint = false;
  417 heikki.linnakangas       2464            6165 :     else if (force || LocalMinRecoveryPoint < lsn)
                               2465                 :     {
 5050 bruce                    2466 ECB             :         XLogRecPtr  newMinRecoveryPoint;
 3778 heikki.linnakangas       2467                 :         TimeLineID  newMinRecoveryPointTLI;
 5163                          2468                 : 
                               2469                 :         /*
                               2470                 :          * To avoid having to update the control file too often, we update it
                               2471                 :          * all the way to the last record being replayed, even though 'lsn'
                               2472                 :          * would suffice for correctness.  This also allows the 'force' case
                               2473                 :          * to not need a valid 'lsn' value.
                               2474                 :          *
                               2475                 :          * Another important reason for doing it this way is that the passed
                               2476                 :          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
                               2477                 :          * the caller got it from a corrupted heap page.  Accepting such a
                               2478                 :          * value as the min recovery point would prevent us from coming up at
                               2479                 :          * all.  Instead, we just log a warning and continue with recovery.
                               2480                 :          * (See also the comments about corrupt LSNs in XLogFlush.)
                               2481                 :          */
  417 heikki.linnakangas       2482 GIC        5869 :         newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
 3754 alvherre                 2483            5869 :         if (!force && newMinRecoveryPoint < lsn)
 5035 tgl                      2484 UIC           0 :             elog(WARNING,
                               2485                 :                  "xlog min recovery request %X/%X is past current point %X/%X",
  775 peter                    2486 ECB             :                  LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
 5035 tgl                      2487                 : 
 5163 heikki.linnakangas       2488 EUB             :         /* update control file */
 3754 alvherre                 2489 GIC        5869 :         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
                               2490                 :         {
 5163 heikki.linnakangas       2491            5849 :             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
 3778                          2492            5849 :             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
 5163 heikki.linnakangas       2493 CBC        5849 :             UpdateControlFile();
  417 heikki.linnakangas       2494 GIC        5849 :             LocalMinRecoveryPoint = newMinRecoveryPoint;
  417 heikki.linnakangas       2495 CBC        5849 :             LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
 5163 heikki.linnakangas       2496 ECB             : 
 5163 heikki.linnakangas       2497 CBC        5849 :             ereport(DEBUG2,
  781 peter                    2498 ECB             :                     (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
  417 heikki.linnakangas       2499                 :                                      LSN_FORMAT_ARGS(newMinRecoveryPoint),
                               2500                 :                                      newMinRecoveryPointTLI)));
 5163                          2501                 :         }
                               2502                 :     }
 5163 heikki.linnakangas       2503 GIC        6166 :     LWLockRelease(ControlFileLock);
                               2504                 : }
                               2505                 : 
                               2506                 : /*
 8062 tgl                      2507 ECB             :  * Ensure that all XLOG data through the given position is flushed to disk.
                               2508                 :  *
                               2509                 :  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
                               2510                 :  * already held, and we try to avoid acquiring it if possible.
                               2511                 :  */
                               2512                 : void
 8062 tgl                      2513 GIC     1019619 : XLogFlush(XLogRecPtr record)
                               2514                 : {
                               2515                 :     XLogRecPtr  WriteRqstPtr;
                               2516                 :     XLogwrtRqst WriteRqst;
  515 rhaas                    2517 CBC     1019619 :     TimeLineID  insertTLI = XLogCtl->InsertTimeLineID;
                               2518                 : 
                               2519                 :     /*
                               2520                 :      * During REDO, we are reading not writing WAL.  Therefore, instead of
 4790 bruce                    2521 ECB             :      * trying to flush the WAL, we should update minRecoveryPoint instead. We
                               2522                 :      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
                               2523                 :      * to act this way too, and because when it tries to write the
                               2524                 :      * end-of-recovery checkpoint, it should indeed flush.
                               2525                 :      */
 5035 tgl                      2526 GIC     1019619 :     if (!XLogInsertAllowed())
                               2527                 :     {
 5163 heikki.linnakangas       2528           87067 :         UpdateMinRecoveryPoint(record, false);
 8062 tgl                      2529          714330 :         return;
 5163 heikki.linnakangas       2530 ECB             :     }
                               2531                 : 
 8062 tgl                      2532                 :     /* Quick exit if already known flushed */
 3754 alvherre                 2533 CBC      932552 :     if (record <= LogwrtResult.Flush)
 8062 tgl                      2534 GIC      627263 :         return;
                               2535                 : 
                               2536                 : #ifdef WAL_DEBUG
 7352 tgl                      2537 ECB             :     if (XLOG_DEBUG)
 7202                          2538                 :         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
                               2539                 :              LSN_FORMAT_ARGS(record),
                               2540                 :              LSN_FORMAT_ARGS(LogwrtResult.Write),
                               2541                 :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
                               2542                 : #endif
                               2543                 : 
 8062 tgl                      2544 GIC      305289 :     START_CRIT_SECTION();
                               2545                 : 
                               2546                 :     /*
                               2547                 :      * Since fsync is usually a horribly expensive operation, we try to
 6385 bruce                    2548 ECB             :      * piggyback as much data as we can on each fsync: if we see any more data
                               2549                 :      * entered into the xlog buffer, we'll write and fsync that too, so that
                               2550                 :      * the final value of LogwrtResult.Flush is as large as possible. This
                               2551                 :      * gives us some chance of avoiding another fsync immediately after.
                               2552                 :      */
                               2553                 : 
                               2554                 :     /* initialize to given target; may increase below */
 8062 tgl                      2555 GIC      305289 :     WriteRqstPtr = record;
                               2556                 : 
                               2557                 :     /*
                               2558                 :      * Now wait until we get the write lock, or someone else does the flush
 3955 bruce                    2559 ECB             :      * for us.
                               2560                 :      */
                               2561                 :     for (;;)
 7772 tgl                      2562 GIC         737 :     {
                               2563                 :         XLogRecPtr  insertpos;
                               2564                 : 
                               2565                 :         /* read LogwrtResult and update local state */
 3121 andres                   2566 CBC      306026 :         SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   2567 GIC      306026 :         if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
                               2568           13438 :             WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
                               2569          306026 :         LogwrtResult = XLogCtl->LogwrtResult;
 3121 andres                   2570 CBC      306026 :         SpinLockRelease(&XLogCtl->info_lck);
 7862 tgl                      2571 ECB             : 
 4087 heikki.linnakangas       2572                 :         /* done already? */
 3754 alvherre                 2573 CBC      306026 :         if (record <= LogwrtResult.Flush)
 4087 heikki.linnakangas       2574            9956 :             break;
                               2575                 : 
                               2576                 :         /*
 3562 heikki.linnakangas       2577 ECB             :          * Before actually performing the write, wait for all in-flight
                               2578                 :          * insertions to the pages we're about to write to finish.
                               2579                 :          */
 3562 heikki.linnakangas       2580 GIC      296070 :         insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
                               2581                 : 
                               2582                 :         /*
                               2583                 :          * Try to get the write lock. If we can't get it immediately, wait
 4087 heikki.linnakangas       2584 ECB             :          * until it's released, and recheck if we still need to do the flush
                               2585                 :          * or if the backend that held the lock did it for us already. This
                               2586                 :          * helps to maintain a good rate of group committing when the system
                               2587                 :          * is bottlenecked by the speed of fsyncing.
                               2588                 :          */
 4078 heikki.linnakangas       2589 GIC      296070 :         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
                               2590                 :         {
                               2591                 :             /*
                               2592                 :              * The lock is now free, but we didn't acquire it yet. Before we
 4087 heikki.linnakangas       2593 ECB             :              * do, loop back to check if someone else flushed the record for
                               2594                 :              * us already.
                               2595                 :              */
 4087 heikki.linnakangas       2596 GIC         737 :             continue;
                               2597                 :         }
                               2598                 : 
                               2599                 :         /* Got the lock; recheck whether request is satisfied */
 4051 heikki.linnakangas       2600 CBC      295333 :         LogwrtResult = XLogCtl->LogwrtResult;
 3754 alvherre                 2601 GIC      295333 :         if (record <= LogwrtResult.Flush)
                               2602                 :         {
 3933 rhaas                    2603             642 :             LWLockRelease(WALWriteLock);
 3933 rhaas                    2604 CBC         642 :             break;
 3933 rhaas                    2605 ECB             :         }
                               2606                 : 
                               2607                 :         /*
                               2608                 :          * Sleep before flush! By adding a delay here, we may give further
                               2609                 :          * backends the opportunity to join the backlog of group commit
                               2610                 :          * followers; this can significantly improve transaction throughput,
                               2611                 :          * at the risk of increasing transaction latency.
                               2612                 :          *
                               2613                 :          * We do not sleep if enableFsync is not turned on, nor if there are
                               2614                 :          * fewer than CommitSiblings other backends with active transactions.
                               2615                 :          */
 3933 rhaas                    2616 GIC      294691 :         if (CommitDelay > 0 && enableFsync &&
 3933 rhaas                    2617 UIC           0 :             MinimumActiveBackends(CommitSiblings))
                               2618                 :         {
                               2619               0 :             pg_usleep(CommitDelay);
 3933 rhaas                    2620 ECB             : 
 3562 heikki.linnakangas       2621 EUB             :             /*
                               2622                 :              * Re-check how far we can now flush the WAL. It's generally not
 2882 simon                    2623                 :              * safe to call WaitXLogInsertionsToFinish while holding
                               2624                 :              * WALWriteLock, because an in-progress insertion might need to
                               2625                 :              * also grab WALWriteLock to make progress. But we know that all
                               2626                 :              * the insertions up to insertpos have already finished, because
                               2627                 :              * that's what the earlier WaitXLogInsertionsToFinish() returned.
                               2628                 :              * We're only calling it again to allow insertpos to be moved
                               2629                 :              * further forward, not to actually wait for anyone.
                               2630                 :              */
 3562 heikki.linnakangas       2631 UIC           0 :             insertpos = WaitXLogInsertionsToFinish(insertpos);
                               2632                 :         }
                               2633                 : 
                               2634                 :         /* try to write/flush later additions to XLOG as well */
 3562 heikki.linnakangas       2635 GBC      294691 :         WriteRqst.Write = insertpos;
 3562 heikki.linnakangas       2636 GIC      294691 :         WriteRqst.Flush = insertpos;
                               2637                 : 
  520 rhaas                    2638          294691 :         XLogWrite(WriteRqst, insertTLI, false);
 3933 rhaas                    2639 ECB             : 
 7862 tgl                      2640 CBC      294691 :         LWLockRelease(WALWriteLock);
                               2641                 :         /* done */
 4087 heikki.linnakangas       2642          294691 :         break;
                               2643                 :     }
 8062 tgl                      2644 ECB             : 
 8062 tgl                      2645 GIC      305289 :     END_CRIT_SECTION();
 7755 tgl                      2646 ECB             : 
                               2647                 :     /* wake up walsenders now that we've released heavily contended locks */
    1 andres                   2648 GNC      305289 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
 3933 rhaas                    2649 ECB             : 
                               2650                 :     /*
                               2651                 :      * If we still haven't flushed to the request point then we have a
 6385 bruce                    2652                 :      * problem; most likely, the requested flush point is past end of XLOG.
                               2653                 :      * This has been seen to occur when a disk page has a corrupted LSN.
                               2654                 :      *
                               2655                 :      * Formerly we treated this as a PANIC condition, but that hurts the
                               2656                 :      * system's robustness rather than helping it: we do not want to take down
                               2657                 :      * the whole system due to corruption on one data page.  In particular, if
                               2658                 :      * the bad page is encountered again during recovery then we would be
                               2659                 :      * unable to restart the database at all!  (This scenario actually
                               2660                 :      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
                               2661                 :      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
                               2662                 :      * the only time we can reach here during recovery is while flushing the
                               2663                 :      * end-of-recovery checkpoint record, and we don't expect that to have a
                               2664                 :      * bad LSN.
                               2665                 :      *
                               2666                 :      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
                               2667                 :      * since xact.c calls this routine inside a critical section.  However,
                               2668                 :      * calls from bufmgr.c are not within critical sections and so we will not
                               2669                 :      * force a restart for a bad LSN on a data page.
                               2670                 :      */
 3754 alvherre                 2671 GIC      305289 :     if (LogwrtResult.Flush < record)
 5035 tgl                      2672 UIC           0 :         elog(ERROR,
                               2673                 :              "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
                               2674                 :              LSN_FORMAT_ARGS(record),
  775 peter                    2675 ECB             :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
 8595 vadim4o                  2676 EUB             : }
                               2677                 : 
                               2678                 : /*
                               2679                 :  * Write & flush xlog, but without specifying exactly where to.
                               2680                 :  *
                               2681                 :  * We normally write only completed blocks; but if there is nothing to do on
                               2682                 :  * that basis, we check for unwritten async commits in the current incomplete
                               2683                 :  * block, and write through the latest one of those.  Thus, if async commits
                               2684                 :  * are not being used, we will write complete blocks only.
                               2685                 :  *
                               2686                 :  * If, based on the above, there's anything to write we do so immediately. But
                               2687                 :  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
                               2688                 :  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
                               2689                 :  * more than wal_writer_flush_after unflushed blocks.
                               2690                 :  *
                               2691                 :  * We can guarantee that async commits reach disk after at most three
                               2692                 :  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
                               2693                 :  * to write "flexibly", meaning it can stop at the end of the buffer ring;
                               2694                 :  * this makes a difference only with very high load or long wal_writer_delay,
                               2695                 :  * but imposes one extra cycle for the worst case for async commits.)
                               2696                 :  *
                               2697                 :  * This routine is invoked periodically by the background walwriter process.
                               2698                 :  *
                               2699                 :  * Returns true if there was any work to do, even if we skipped flushing due
                               2700                 :  * to wal_writer_delay/wal_writer_flush_after.
                               2701                 :  */
                               2702                 : bool
 5738 tgl                      2703 GIC       14942 : XLogBackgroundFlush(void)
                               2704                 : {
                               2705                 :     XLogwrtRqst WriteRqst;
                               2706           14942 :     bool        flexible = true;
 2610 andres                   2707 ECB             :     static TimestampTz lastflush;
                               2708                 :     TimestampTz now;
                               2709                 :     int         flushbytes;
  520 rhaas                    2710                 :     TimeLineID  insertTLI;
                               2711                 : 
                               2712                 :     /* XLOG doesn't need flushing during recovery */
 5163 heikki.linnakangas       2713 GIC       14942 :     if (RecoveryInProgress())
 3988 tgl                      2714               8 :         return false;
                               2715                 : 
                               2716                 :     /*
  515 rhaas                    2717 ECB             :      * Since we're not in recovery, InsertTimeLineID is set and can't change,
  520                          2718                 :      * so we can read it without a lock.
                               2719                 :      */
  515 rhaas                    2720 GIC       14934 :     insertTLI = XLogCtl->InsertTimeLineID;
                               2721                 : 
                               2722                 :     /* read LogwrtResult and update local state */
 3121 andres                   2723           14934 :     SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   2724 CBC       14934 :     LogwrtResult = XLogCtl->LogwrtResult;
 2610 andres                   2725 GIC       14934 :     WriteRqst = XLogCtl->LogwrtRqst;
 3121                          2726           14934 :     SpinLockRelease(&XLogCtl->info_lck);
 5738 tgl                      2727 ECB             : 
                               2728                 :     /* back off to last completed page boundary */
 2610 andres                   2729 CBC       14934 :     WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
 5738 tgl                      2730 ECB             : 
                               2731                 :     /* if we have already flushed that far, consider async commit records */
 2610 andres                   2732 GIC       14934 :     if (WriteRqst.Write <= LogwrtResult.Flush)
 5738 tgl                      2733 ECB             :     {
 3121 andres                   2734 GIC        8201 :         SpinLockAcquire(&XLogCtl->info_lck);
 2610                          2735            8201 :         WriteRqst.Write = XLogCtl->asyncXactLSN;
 3121 andres                   2736 CBC        8201 :         SpinLockRelease(&XLogCtl->info_lck);
 5738 tgl                      2737 GIC        8201 :         flexible = false;       /* ensure it all gets written */
 5738 tgl                      2738 ECB             :     }
                               2739                 : 
 4687 magnus                   2740                 :     /*
 4660 bruce                    2741                 :      * If already known flushed, we're done. Just need to check if we are
                               2742                 :      * holding an open file handle to a logfile that's no longer in use,
                               2743                 :      * preventing the file from being deleted.
                               2744                 :      */
 2610 andres                   2745 GIC       14934 :     if (WriteRqst.Write <= LogwrtResult.Flush)
                               2746                 :     {
 4660 bruce                    2747            7725 :         if (openLogFile >= 0)
                               2748                 :         {
 2028 andres                   2749 CBC        4361 :             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
                               2750                 :                                  wal_segment_size))
 4687 magnus                   2751 ECB             :             {
 4687 magnus                   2752 GIC          79 :                 XLogFileClose();
 4687 magnus                   2753 ECB             :             }
                               2754                 :         }
 3988 tgl                      2755 GIC        7725 :         return false;
 4687 magnus                   2756 ECB             :     }
                               2757                 : 
                               2758                 :     /*
 2610 andres                   2759                 :      * Determine how far to flush WAL, based on the wal_writer_delay and
                               2760                 :      * wal_writer_flush_after GUCs.
                               2761                 :      */
 2610 andres                   2762 GIC        7209 :     now = GetCurrentTimestamp();
                               2763            7209 :     flushbytes =
                               2764            7209 :         WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
                               2765                 : 
 2610 andres                   2766 CBC        7209 :     if (WalWriterFlushAfter == 0 || lastflush == 0)
 2610 andres                   2767 ECB             :     {
                               2768                 :         /* first call, or block based limits disabled */
 2610 andres                   2769 GIC         175 :         WriteRqst.Flush = WriteRqst.Write;
 2610 andres                   2770 CBC         175 :         lastflush = now;
                               2771                 :     }
 2610 andres                   2772 GIC        7034 :     else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
 2610 andres                   2773 ECB             :     {
                               2774                 :         /*
                               2775                 :          * Flush the writes at least every WalWriterDelay ms. This is
 1329 michael                  2776                 :          * important to bound the amount of time it takes for an asynchronous
                               2777                 :          * commit to hit disk.
                               2778                 :          */
 2610 andres                   2779 GIC        1705 :         WriteRqst.Flush = WriteRqst.Write;
                               2780            1705 :         lastflush = now;
                               2781                 :     }
                               2782            5329 :     else if (flushbytes >= WalWriterFlushAfter)
 2610 andres                   2783 ECB             :     {
                               2784                 :         /* exceeded wal_writer_flush_after blocks, flush */
 2610 andres                   2785 GIC           3 :         WriteRqst.Flush = WriteRqst.Write;
 2610 andres                   2786 CBC           3 :         lastflush = now;
                               2787                 :     }
                               2788                 :     else
 2610 andres                   2789 ECB             :     {
                               2790                 :         /* no flushing, this time round */
 2610 andres                   2791 GIC        5326 :         WriteRqst.Flush = 0;
                               2792                 :     }
                               2793                 : 
                               2794                 : #ifdef WAL_DEBUG
 5738 tgl                      2795 ECB             :     if (XLOG_DEBUG)
                               2796                 :         elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
                               2797                 :              LSN_FORMAT_ARGS(WriteRqst.Write),
                               2798                 :              LSN_FORMAT_ARGS(WriteRqst.Flush),
                               2799                 :              LSN_FORMAT_ARGS(LogwrtResult.Write),
                               2800                 :              LSN_FORMAT_ARGS(LogwrtResult.Flush));
                               2801                 : #endif
                               2802                 : 
 5738 tgl                      2803 GIC        7209 :     START_CRIT_SECTION();
                               2804                 : 
                               2805                 :     /* now wait for any in-progress insertions to finish and get write lock */
 2610 andres                   2806            7209 :     WaitXLogInsertionsToFinish(WriteRqst.Write);
 5738 tgl                      2807 CBC        7209 :     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 4051 heikki.linnakangas       2808 GIC        7209 :     LogwrtResult = XLogCtl->LogwrtResult;
 2610 andres                   2809            7209 :     if (WriteRqst.Write > LogwrtResult.Write ||
 2610 andres                   2810 CBC        2366 :         WriteRqst.Flush > LogwrtResult.Flush)
 5738 tgl                      2811 ECB             :     {
  520 rhaas                    2812 CBC        4879 :         XLogWrite(WriteRqst, insertTLI, flexible);
 5738 tgl                      2813 ECB             :     }
 5738 tgl                      2814 CBC        7209 :     LWLockRelease(WALWriteLock);
                               2815                 : 
                               2816            7209 :     END_CRIT_SECTION();
                               2817                 : 
 3933 rhaas                    2818 ECB             :     /* wake up walsenders now that we've released heavily contended locks */
    1 andres                   2819 GNC        7209 :     WalSndWakeupProcessRequests(true, !RecoveryInProgress());
 3958 simon                    2820 ECB             : 
                               2821                 :     /*
                               2822                 :      * Great, done. To take some work off the critical path, try to initialize
 3562 heikki.linnakangas       2823                 :      * as many of the no-longer-needed WAL buffers for future use as we can.
                               2824                 :      */
  520 rhaas                    2825 GIC        7209 :     AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
                               2826                 : 
                               2827                 :     /*
                               2828                 :      * If we determined that we need to write data, but somebody else
 2610 andres                   2829 ECB             :      * wrote/flushed already, it should be considered as being active, to
                               2830                 :      * avoid hibernating too early.
                               2831                 :      */
 2610 andres                   2832 GIC        7209 :     return true;
                               2833                 : }
                               2834                 : 
                               2835                 : /*
 5793 tgl                      2836 ECB             :  * Test whether XLOG data has been flushed up to (at least) the given position.
                               2837                 :  *
                               2838                 :  * Returns true if a flush is still needed.  (It may be that someone else
                               2839                 :  * is already in process of flushing that far, however.)
                               2840                 :  */
                               2841                 : bool
 5793 tgl                      2842 GIC    14946151 : XLogNeedsFlush(XLogRecPtr record)
                               2843                 : {
                               2844                 :     /*
                               2845                 :      * During recovery, we don't flush WAL but update minRecoveryPoint
 4859 simon                    2846 ECB             :      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
                               2847                 :      * would need to be updated.
                               2848                 :      */
 5163 heikki.linnakangas       2849 GIC    14946151 :     if (RecoveryInProgress())
                               2850                 :     {
                               2851                 :         /*
                               2852                 :          * An invalid minRecoveryPoint means that we need to recover all the
 1739 michael                  2853 ECB             :          * WAL, i.e., we're doing crash recovery.  We never modify the control
                               2854                 :          * file's value in that case, so we can short-circuit future checks
                               2855                 :          * here too.  This triggers a quick exit path for the startup process,
                               2856                 :          * which cannot update its local copy of minRecoveryPoint as long as
                               2857                 :          * it has not replayed all WAL available when doing crash recovery.
                               2858                 :          */
  417 heikki.linnakangas       2859 GIC     1407027 :         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
 1739 michael                  2860 UIC           0 :             updateMinRecoveryPoint = false;
                               2861                 : 
                               2862                 :         /* Quick exit if already known to be updated or cannot be updated */
  417 heikki.linnakangas       2863 CBC     1407027 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
 4859 simon                    2864 GBC     1392862 :             return false;
                               2865                 : 
                               2866                 :         /*
 4859 simon                    2867 ECB             :          * Update local copy of minRecoveryPoint. But if the lock is busy,
                               2868                 :          * just return a conservative guess.
                               2869                 :          */
 4859 simon                    2870 GIC       14165 :         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
 4859 simon                    2871 UIC           0 :             return true;
  417 heikki.linnakangas       2872 GIC       14165 :         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
                               2873           14165 :         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 4859 simon                    2874 CBC       14165 :         LWLockRelease(ControlFileLock);
 4859 simon                    2875 EUB             : 
 1682 michael                  2876 ECB             :         /*
                               2877                 :          * Check minRecoveryPoint for any other process than the startup
                               2878                 :          * process doing crash recovery, which should not update the control
                               2879                 :          * file value if crash recovery is still running.
                               2880                 :          */
  417 heikki.linnakangas       2881 GIC       14165 :         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
 1682 michael                  2882 UIC           0 :             updateMinRecoveryPoint = false;
                               2883                 : 
                               2884                 :         /* check again */
  417 heikki.linnakangas       2885 CBC       14165 :         if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
 1682 michael                  2886 GBC          77 :             return false;
                               2887                 :         else
 1682 michael                  2888 GIC       14088 :             return true;
 4859 simon                    2889 ECB             :     }
 5163 heikki.linnakangas       2890                 : 
                               2891                 :     /* Quick exit if already known flushed */
 3754 alvherre                 2892 CBC    13539124 :     if (record <= LogwrtResult.Flush)
 5793 tgl                      2893 GIC    13431597 :         return false;
                               2894                 : 
                               2895                 :     /* read LogwrtResult and update local state */
 3121 andres                   2896 CBC      107527 :     SpinLockAcquire(&XLogCtl->info_lck);
                               2897          107527 :     LogwrtResult = XLogCtl->LogwrtResult;
 3121 andres                   2898 GIC      107527 :     SpinLockRelease(&XLogCtl->info_lck);
                               2899                 : 
 5793 tgl                      2900 ECB             :     /* check again */
 3754 alvherre                 2901 CBC      107527 :     if (record <= LogwrtResult.Flush)
 5793 tgl                      2902            2195 :         return false;
                               2903                 : 
 5793 tgl                      2904 GIC      105332 :     return true;
 5793 tgl                      2905 ECB             : }
                               2906                 : 
                               2907                 : /*
  650 noah                     2908                 :  * Try to make a given XLOG file segment exist.
                               2909                 :  *
                               2910                 :  * logsegno: identify segment.
                               2911                 :  *
                               2912                 :  * *added: on return, true if this call raised the number of extant segments.
                               2913                 :  *
                               2914                 :  * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
                               2915                 :  *
                               2916                 :  * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
                               2917                 :  * wanting an open segment should attempt to open "path", which usually will
                               2918                 :  * succeed.  (This is weird, but it's efficient for the callers.)
                               2919                 :  */
                               2920                 : static int
  520 rhaas                    2921 GIC        7666 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
                               2922                 :                      bool *added, char *path)
                               2923                 : {
                               2924                 :     char        tmppath[MAXPGPATH];
                               2925                 :     XLogSegNo   installed_segno;
                               2926                 :     XLogSegNo   max_segno;
                               2927                 :     int         fd;
                               2928                 :     int         save_errno;
    1 tmunro                   2929 GNC        7666 :     int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
                               2930                 : 
  520 rhaas                    2931 GIC        7666 :     Assert(logtli != 0);
                               2932                 : 
  520 rhaas                    2933 CBC        7666 :     XLogFilePath(path, logtli, logsegno, wal_segment_size);
                               2934                 : 
 8190 vadim4o                  2935 ECB             :     /*
                               2936                 :      * Try to use existent file (checkpoint maker may have created it already)
                               2937                 :      */
  650 noah                     2938 GIC        7666 :     *added = false;
   37 tmunro                   2939 GNC        7666 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
                               2940            7666 :                        get_sync_bit(sync_method));
  650 noah                     2941 GIC        7666 :     if (fd < 0)
                               2942                 :     {
  650 noah                     2943 CBC         684 :         if (errno != ENOENT)
  650 noah                     2944 LBC           0 :             ereport(ERROR,
  650 noah                     2945 ECB             :                     (errcode_for_file_access(),
                               2946                 :                      errmsg("could not open file \"%s\": %m", path)));
                               2947                 :     }
                               2948                 :     else
  650 noah                     2949 GBC        6982 :         return fd;
                               2950                 : 
                               2951                 :     /*
                               2952                 :      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
                               2953                 :      * another process is doing the same thing.  If so, we will end up
 6385 bruce                    2954 ECB             :      * pre-creating an extra log segment.  That seems OK, and better than
                               2955                 :      * holding the lock throughout this lengthy process.
                               2956                 :      */
 5762 tgl                      2957 GIC         684 :     elog(DEBUG2, "creating and filling new WAL file");
                               2958                 : 
 6488                          2959             684 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
                               2960                 : 
 8058                          2961             684 :     unlink(tmppath);
 8595 vadim4o                  2962 ECB             : 
    1 tmunro                   2963 GNC         684 :     if (io_direct_flags & IO_DIRECT_WAL_INIT)
    1 tmunro                   2964 UNC           0 :         open_flags |= PG_O_DIRECT;
                               2965                 : 
                               2966                 :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    1 tmunro                   2967 GNC         684 :     fd = BasicOpenFile(tmppath, open_flags);
 8595 vadim4o                  2968 GIC         684 :     if (fd < 0)
 6568 tgl                      2969 LBC           0 :         ereport(ERROR,
                               2970                 :                 (errcode_for_file_access(),
 7136 peter_e                  2971 ECB             :                  errmsg("could not create file \"%s\": %m", tmppath)));
 8595 vadim4o                  2972 EUB             : 
 1468 tmunro                   2973 CBC         684 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
                               2974             684 :     save_errno = 0;
 1468 tmunro                   2975 GBC         684 :     if (wal_init_zero)
                               2976                 :     {
                               2977                 :         ssize_t     rc;
  818 tmunro                   2978 ECB             : 
 1468                          2979                 :         /*
                               2980                 :          * Zero-fill the file.  With this setting, we do this the hard way to
                               2981                 :          * ensure that all the file space has really been allocated.  On
                               2982                 :          * platforms that allow "holes" in files, just seeking to the end
                               2983                 :          * doesn't allocate intermediate space.  This way, we know that we
                               2984                 :          * have all the space and (after the fsync below) that all the
                               2985                 :          * indirect blocks are down on disk.  Therefore, fdatasync(2) or
                               2986                 :          * O_DSYNC will be sufficient to sync future writes to the log file.
                               2987                 :          */
   34 michael                  2988 GNC         684 :         rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
                               2989                 : 
  152                          2990             684 :         if (rc < 0)
  152 michael                  2991 UNC           0 :             save_errno = errno;
 1468 tmunro                   2992 ECB             :     }
                               2993                 :     else
                               2994                 :     {
                               2995                 :         /*
                               2996                 :          * Otherwise, seeking to the end and writing a solitary byte is
                               2997                 :          * enough.
                               2998                 :          */
 3504 jdavis                   2999 UBC           0 :         errno = 0;
  152 michael                  3000 UNC           0 :         if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
 3565 jdavis                   3001 EUB             :         {
                               3002                 :             /* if write didn't set errno, assume no disk space */
 1468 tmunro                   3003 UBC           0 :             save_errno = errno ? errno : ENOSPC;
                               3004                 :         }
 1468 tmunro                   3005 EUB             :     }
 1468 tmunro                   3006 GIC         684 :     pgstat_report_wait_end();
                               3007                 : 
                               3008             684 :     if (save_errno)
                               3009                 :     {
 1468 tmunro                   3010 ECB             :         /*
                               3011                 :          * If we fail to make the file, delete it to release disk space
                               3012                 :          */
 1468 tmunro                   3013 UBC           0 :         unlink(tmppath);
 3565 jdavis                   3014 EUB             : 
 1468 tmunro                   3015 UBC           0 :         close(fd);
 3565 jdavis                   3016 EUB             : 
 1468 tmunro                   3017 UIC           0 :         errno = save_errno;
                               3018                 : 
                               3019               0 :         ereport(ERROR,
 1468 tmunro                   3020 ECB             :                 (errcode_for_file_access(),
                               3021                 :                  errmsg("could not write to file \"%s\": %m", tmppath)));
 8085 tgl                      3022                 :     }
 8595 vadim4o                  3023 EUB             : 
 2213 rhaas                    3024 GIC         684 :     pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
 8157 tgl                      3025             684 :     if (pg_fsync(fd) != 0)
                               3026                 :     {
  226 drowley                  3027 UNC           0 :         save_errno = errno;
 3785 heikki.linnakangas       3028 UIC           0 :         close(fd);
 1749 michael                  3029               0 :         errno = save_errno;
 6568 tgl                      3030               0 :         ereport(ERROR,
 7202 tgl                      3031 ECB             :                 (errcode_for_file_access(),
                               3032                 :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
                               3033                 :     }
 2213 rhaas                    3034 GIC         684 :     pgstat_report_wait_end();
                               3035                 : 
 1373 peter                    3036             684 :     if (close(fd) != 0)
 6568 tgl                      3037 UIC           0 :         ereport(ERROR,
                               3038                 :                 (errcode_for_file_access(),
                               3039                 :                  errmsg("could not close file \"%s\": %m", tmppath)));
                               3040                 : 
                               3041                 :     /*
  650 noah                     3042 ECB             :      * Now move the segment into place with its final name.  Cope with
                               3043                 :      * possibility that someone else has created the file while we were
                               3044                 :      * filling ours: if so, use ours to pre-create a future log segment.
                               3045                 :      */
 3941 heikki.linnakangas       3046 CBC         684 :     installed_segno = logsegno;
 2967 heikki.linnakangas       3047 ECB             : 
                               3048                 :     /*
                               3049                 :      * XXX: What should we use as max_segno? We used to use XLOGfileslop when
                               3050                 :      * that was a constant, but that was always a bit dubious: normally, at a
                               3051                 :      * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
                               3052                 :      * here, it was the offset from the insert location. We can't do the
                               3053                 :      * normal XLOGfileslop calculation here because we don't have access to
                               3054                 :      * the prior checkpoint's redo location. So somewhat arbitrarily, just use
                               3055                 :      * CheckPointSegments.
 2967 heikki.linnakangas       3056 EUB             :      */
 2967 heikki.linnakangas       3057 GBC         684 :     max_segno = logsegno + CheckPointSegments;
  520 rhaas                    3058 GIC         684 :     if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
                               3059                 :                                logtli))
  650 noah                     3060 ECB             :     {
  650 noah                     3061 GIC         684 :         *added = true;
                               3062             684 :         elog(DEBUG2, "done creating and filling new WAL file");
                               3063                 :     }
                               3064                 :     else
                               3065                 :     {
                               3066                 :         /*
                               3067                 :          * No need for any more future segments, or InstallXLogFileSegment()
                               3068                 :          * failed to rename the file into place. If the rename failed, a
                               3069                 :          * caller opening the file may fail.
                               3070                 :          */
 7934 tgl                      3071 UIC           0 :         unlink(tmppath);
  650 noah                     3072               0 :         elog(DEBUG2, "abandoned new WAL file");
                               3073                 :     }
                               3074                 : 
  650 noah                     3075 GIC         684 :     return -1;
  650 noah                     3076 ECB             : }
                               3077                 : 
                               3078                 : /*
                               3079                 :  * Create a new XLOG file segment, or open a pre-existing one.
                               3080                 :  *
                               3081                 :  * logsegno: identify segment to be created/opened.
                               3082                 :  *
                               3083                 :  * Returns FD of opened file.
                               3084                 :  *
                               3085                 :  * Note: errors here are ERROR not PANIC because we might or might not be
                               3086                 :  * inside a critical section (eg, during checkpoint there is no reason to
                               3087                 :  * take down the system on failure).  They will promote to PANIC if we are
                               3088                 :  * in a critical section.
                               3089                 :  */
                               3090                 : int
  520 rhaas                    3091 CBC        7624 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
  650 noah                     3092 EUB             : {
                               3093                 :     bool        ignore_added;
                               3094                 :     char        path[MAXPGPATH];
  650 noah                     3095 ECB             :     int         fd;
                               3096                 : 
  520 rhaas                    3097 GIC        7624 :     Assert(logtli != 0);
                               3098                 : 
                               3099            7624 :     fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
  650 noah                     3100            7624 :     if (fd >= 0)
                               3101            6949 :         return fd;
                               3102                 : 
                               3103                 :     /* Now open original target segment (might not be file I just made) */
   37 tmunro                   3104 GNC         675 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
                               3105             675 :                        get_sync_bit(sync_method));
 7934 tgl                      3106 GIC         675 :     if (fd < 0)
 6568 tgl                      3107 UIC           0 :         ereport(ERROR,
                               3108                 :                 (errcode_for_file_access(),
                               3109                 :                  errmsg("could not open file \"%s\": %m", path)));
 6297 neilc                    3110 GIC         675 :     return fd;
                               3111                 : }
                               3112                 : 
                               3113                 : /*
                               3114                 :  * Create a new XLOG file segment by copying a pre-existing one.
 6836 tgl                      3115 ECB             :  *
                               3116                 :  * destsegno: identify segment to be created.
                               3117                 :  *
                               3118                 :  * srcTLI, srcsegno: identify segment to be copied (could be from
                               3119                 :  *      a different timeline)
                               3120                 :  *
                               3121                 :  * upto: how much of the source file to copy (the rest is filled with
                               3122                 :  *      zeros)
                               3123                 :  *
                               3124                 :  * Currently this is only used during recovery, and so there are no locking
                               3125                 :  * considerations.  But we should be just as tense as XLogFileInit to avoid
                               3126                 :  * emplacing a bogus file.
                               3127                 :  */
                               3128                 : static void
  520 rhaas                    3129 CBC          30 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
  520 rhaas                    3130 ECB             :              TimeLineID srcTLI, XLogSegNo srcsegno,
 2839 fujii                    3131                 :              int upto)
 6836 tgl                      3132 EUB             : {
                               3133                 :     char        path[MAXPGPATH];
                               3134                 :     char        tmppath[MAXPGPATH];
                               3135                 :     PGAlignedXLogBlock buffer;
                               3136                 :     int         srcfd;
                               3137                 :     int         fd;
                               3138                 :     int         nbytes;
 6836 tgl                      3139 ECB             : 
                               3140                 :     /*
                               3141                 :      * Open the source file
                               3142                 :      */
 2028 andres                   3143 GIC          30 :     XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
 2024 peter_e                  3144 CBC          30 :     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
 6836 tgl                      3145              30 :     if (srcfd < 0)
 6568 tgl                      3146 UBC           0 :         ereport(ERROR,
                               3147                 :                 (errcode_for_file_access(),
                               3148                 :                  errmsg("could not open file \"%s\": %m", path)));
                               3149                 : 
                               3150                 :     /*
                               3151                 :      * Copy into a temp file name.
                               3152                 :      */
 6488 tgl                      3153 CBC          30 :     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
                               3154                 : 
 6836 tgl                      3155 GIC          30 :     unlink(tmppath);
                               3156                 : 
 5443 magnus                   3157 ECB             :     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
 2024 peter_e                  3158 GIC          30 :     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
 6836 tgl                      3159              30 :     if (fd < 0)
 6568 tgl                      3160 UIC           0 :         ereport(ERROR,
                               3161                 :                 (errcode_for_file_access(),
                               3162                 :                  errmsg("could not create file \"%s\": %m", tmppath)));
 6836 tgl                      3163 ECB             : 
                               3164                 :     /*
                               3165                 :      * Do the data copying.
                               3166                 :      */
 2028 andres                   3167 GIC       61470 :     for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
                               3168                 :     {
                               3169                 :         int         nread;
 3034 heikki.linnakangas       3170 ECB             : 
 3034 heikki.linnakangas       3171 CBC       61440 :         nread = upto - nbytes;
 3034 heikki.linnakangas       3172 ECB             : 
                               3173                 :         /*
 2878 bruce                    3174                 :          * The part that is not read from the source file is filled with
                               3175                 :          * zeros.
 3034 heikki.linnakangas       3176 EUB             :          */
 3034 heikki.linnakangas       3177 GBC       61440 :         if (nread < sizeof(buffer))
 1681 tgl                      3178 GIC          30 :             memset(buffer.data, 0, sizeof(buffer));
                               3179                 : 
 3034 heikki.linnakangas       3180           61440 :         if (nread > 0)
                               3181                 :         {
 1726 michael                  3182 EUB             :             int         r;
                               3183                 : 
 3034 heikki.linnakangas       3184 GIC        2603 :             if (nread > sizeof(buffer))
                               3185            2573 :                 nread = sizeof(buffer);
 2213 rhaas                    3186            2603 :             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
 1681 tgl                      3187 CBC        2603 :             r = read(srcfd, buffer.data, nread);
 1726 michael                  3188 GIC        2603 :             if (r != nread)
 3034 heikki.linnakangas       3189 ECB             :             {
 1726 michael                  3190 LBC           0 :                 if (r < 0)
 3034 heikki.linnakangas       3191               0 :                     ereport(ERROR,
                               3192                 :                             (errcode_for_file_access(),
 2893 heikki.linnakangas       3193 EUB             :                              errmsg("could not read file \"%s\": %m",
                               3194                 :                                     path)));
                               3195                 :                 else
 3034 heikki.linnakangas       3196 UIC           0 :                     ereport(ERROR,
                               3197                 :                             (errcode(ERRCODE_DATA_CORRUPTED),
 1721 michael                  3198 EUB             :                              errmsg("could not read file \"%s\": read %d of %zu",
                               3199                 :                                     path, r, (Size) nread)));
 3034 heikki.linnakangas       3200                 :             }
 2213 rhaas                    3201 GIC        2603 :             pgstat_report_wait_end();
 6836 tgl                      3202 EUB             :         }
 6836 tgl                      3203 GIC       61440 :         errno = 0;
 2213 rhaas                    3204           61440 :         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
 1681 tgl                      3205           61440 :         if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
 6836 tgl                      3206 ECB             :         {
 6836 tgl                      3207 UIC           0 :             int         save_errno = errno;
                               3208                 : 
 6836 tgl                      3209 ECB             :             /*
 6385 bruce                    3210                 :              * If we fail to make the file, delete it to release disk space
 6836 tgl                      3211 EUB             :              */
 6836 tgl                      3212 UIC           0 :             unlink(tmppath);
                               3213                 :             /* if write didn't set errno, assume problem is no disk space */
 6836 tgl                      3214 LBC           0 :             errno = save_errno ? save_errno : ENOSPC;
                               3215                 : 
 6568                          3216               0 :             ereport(ERROR,
 6836 tgl                      3217 EUB             :                     (errcode_for_file_access(),
                               3218                 :                      errmsg("could not write to file \"%s\": %m", tmppath)));
                               3219                 :         }
 2213 rhaas                    3220 GIC       61440 :         pgstat_report_wait_end();
 6836 tgl                      3221 ECB             :     }
 6836 tgl                      3222 EUB             : 
 2213 rhaas                    3223 GIC          30 :     pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
 6836 tgl                      3224              30 :     if (pg_fsync(fd) != 0)
 1602 tmunro                   3225 UIC           0 :         ereport(data_sync_elevel(ERROR),
                               3226                 :                 (errcode_for_file_access(),
                               3227                 :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
 2213 rhaas                    3228 GIC          30 :     pgstat_report_wait_end();
 6836 tgl                      3229 ECB             : 
 1373 peter                    3230 GBC          30 :     if (CloseTransientFile(fd) != 0)
 6568 tgl                      3231 LBC           0 :         ereport(ERROR,
                               3232                 :                 (errcode_for_file_access(),
                               3233                 :                  errmsg("could not close file \"%s\": %m", tmppath)));
                               3234                 : 
 1373 peter                    3235 GIC          30 :     if (CloseTransientFile(srcfd) != 0)
 1492 michael                  3236 UIC           0 :         ereport(ERROR,
                               3237                 :                 (errcode_for_file_access(),
                               3238                 :                  errmsg("could not close file \"%s\": %m", path)));
                               3239                 : 
                               3240                 :     /*
                               3241                 :      * Now move the segment into place with its final name.
                               3242                 :      */
  520 rhaas                    3243 GIC          30 :     if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
 2839 fujii                    3244 UIC           0 :         elog(ERROR, "InstallXLogFileSegment should not have failed");
 6836 tgl                      3245 GIC          30 : }
                               3246                 : 
                               3247                 : /*
                               3248                 :  * Install a new XLOG segment file as a current or future log segment.
                               3249                 :  *
                               3250                 :  * This is used both to install a newly-created segment (which has a temp
                               3251                 :  * filename while it's being created) and to recycle an old segment.
                               3252                 :  *
                               3253                 :  * *segno: identify segment to install as (or first possible target).
                               3254                 :  * When find_free is true, this is modified on return to indicate the
                               3255                 :  * actual installation location or last segment searched.
                               3256                 :  *
                               3257                 :  * tmppath: initial name of file to install.  It will be renamed into place.
                               3258                 :  *
                               3259                 :  * find_free: if true, install the new segment at the first empty segno
 2062 peter_e                  3260 ECB             :  * number at or after the passed numbers.  If false, install the new segment
                               3261                 :  * exactly where specified, deleting any existing segment file there.
                               3262                 :  *
                               3263                 :  * max_segno: maximum segment number to install the new file as.  Fail if no
                               3264                 :  * free slot is found between *segno and max_segno. (Ignored when find_free
                               3265                 :  * is false.)
 7934 tgl                      3266                 :  *
                               3267                 :  * tli: The timeline on which the new segment should be installed.
  520 rhaas                    3268                 :  *
                               3269                 :  * Returns true if the file was installed successfully.  false indicates that
  650 noah                     3270                 :  * max_segno limit was exceeded, the startup process has disabled this
                               3271                 :  * function for now, or an error occurred while renaming the file into place.
                               3272                 :  */
 7934 tgl                      3273 EUB             : static bool
 3941 heikki.linnakangas       3274 GBC        1267 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                               3275                 :                        bool find_free, XLogSegNo max_segno, TimeLineID tli)
                               3276                 : {
 7934 tgl                      3277 ECB             :     char        path[MAXPGPATH];
                               3278                 :     struct stat stat_buf;
                               3279                 : 
  520 rhaas                    3280 CBC        1267 :     Assert(tli != 0);
                               3281                 : 
  520 rhaas                    3282 GIC        1267 :     XLogFilePath(path, tli, *segno, wal_segment_size);
                               3283                 : 
  650 noah                     3284            1267 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  650 noah                     3285 CBC        1267 :     if (!XLogCtl->InstallXLogFileSegmentActive)
                               3286                 :     {
  650 noah                     3287 LBC           0 :         LWLockRelease(ControlFileLock);
  650 noah                     3288 UIC           0 :         return false;
                               3289                 :     }
 8058 tgl                      3290 ECB             : 
 7934 tgl                      3291 CBC        1267 :     if (!find_free)
                               3292                 :     {
 7934 tgl                      3293 ECB             :         /* Force installation: get rid of any pre-existing segment file */
 2204 teodor                   3294 CBC          30 :         durable_unlink(path, DEBUG1);
                               3295                 :     }
                               3296                 :     else
                               3297                 :     {
 7934 tgl                      3298 ECB             :         /* Find a free slot to put it in */
 7379 tgl                      3299 CBC        1643 :         while (stat(path, &stat_buf) == 0)
                               3300                 :         {
 2967 heikki.linnakangas       3301 GBC         426 :             if ((*segno) >= max_segno)
                               3302                 :             {
 7934 tgl                      3303 EUB             :                 /* Failed to find a free slot within specified range */
  650 noah                     3304 GIC          20 :                 LWLockRelease(ControlFileLock);
 7934 tgl                      3305              20 :                 return false;
 7934 tgl                      3306 ECB             :             }
 3941 heikki.linnakangas       3307 GIC         406 :             (*segno)++;
  520 rhaas                    3308 CBC         406 :             XLogFilePath(path, tli, *segno, wal_segment_size);
                               3309                 :         }
                               3310                 :     }
                               3311                 : 
  278 michael                  3312 GNC        1247 :     Assert(access(path, F_OK) != 0 && errno == ENOENT);
                               3313            1247 :     if (durable_rename(tmppath, path, LOG) != 0)
                               3314                 :     {
  650 noah                     3315 UIC           0 :         LWLockRelease(ControlFileLock);
                               3316                 :         /* durable_rename already emitted log message */
 4956 heikki.linnakangas       3317 LBC           0 :         return false;
                               3318                 :     }
 8190 vadim4o                  3319 ECB             : 
  650 noah                     3320 CBC        1247 :     LWLockRelease(ControlFileLock);
 8058 tgl                      3321 ECB             : 
 7934 tgl                      3322 GBC        1247 :     return true;
                               3323                 : }
                               3324                 : 
                               3325                 : /*
 6836 tgl                      3326 ECB             :  * Open a pre-existing logfile segment for writing.
                               3327                 :  */
                               3328                 : int
  520 rhaas                    3329 GIC           7 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
                               3330                 : {
                               3331                 :     char        path[MAXPGPATH];
                               3332                 :     int         fd;
 8595 vadim4o                  3333 ECB             : 
  520 rhaas                    3334 GIC           7 :     XLogFilePath(path, tli, segno, wal_segment_size);
 8595 vadim4o                  3335 ECB             : 
   37 tmunro                   3336 GNC           7 :     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
                               3337               7 :                        get_sync_bit(sync_method));
 8595 vadim4o                  3338 GIC           7 :     if (fd < 0)
 7202 tgl                      3339 UIC           0 :         ereport(PANIC,
                               3340                 :                 (errcode_for_file_access(),
                               3341                 :                  errmsg("could not open file \"%s\": %m", path)));
                               3342                 : 
 6836 tgl                      3343 GIC           7 :     return fd;
                               3344                 : }
 6836 tgl                      3345 ECB             : 
 6142 bruce                    3346                 : /*
                               3347                 :  * Close the current logfile segment for writing.
                               3348                 :  */
                               3349                 : static void
 6142 bruce                    3350 GIC        1705 : XLogFileClose(void)
                               3351                 : {
 6142 bruce                    3352 GBC        1705 :     Assert(openLogFile >= 0);
                               3353                 : 
 6142 bruce                    3354 EUB             :     /*
 6139 tgl                      3355                 :      * WAL segment files will not be re-read in normal operation, so we advise
 3260 bruce                    3356                 :      * the OS to release any cached pages.  But do not do so if WAL archiving
                               3357                 :      * or streaming is active, because archiver and walsender process could
                               3358                 :      * use the cache to read the WAL segment.
                               3359                 :      */
                               3360                 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    1 tmunro                   3361 GNC        1705 :     if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
 5201 tgl                      3362 CBC        1257 :         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 6142 bruce                    3363 ECB             : #endif
                               3364                 : 
 1373 peter                    3365 GIC        1705 :     if (close(openLogFile) != 0)
                               3366                 :     {
                               3367                 :         char        xlogfname[MAXFNAMELEN];
 1223 michael                  3368 UIC           0 :         int         save_errno = errno;
                               3369                 : 
  520 rhaas                    3370               0 :         XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
 1223 michael                  3371               0 :         errno = save_errno;
 6142 bruce                    3372               0 :         ereport(PANIC,
                               3373                 :                 (errcode_for_file_access(),
                               3374                 :                  errmsg("could not close file \"%s\": %m", xlogfname)));
                               3375                 :     }
                               3376                 : 
 6142 bruce                    3377 GIC        1705 :     openLogFile = -1;
 1140 tgl                      3378            1705 :     ReleaseExternalFD();
 6142 bruce                    3379            1705 : }
                               3380                 : 
                               3381                 : /*
                               3382                 :  * Preallocate log files beyond the specified log endpoint.
                               3383                 :  *
                               3384                 :  * XXX this is currently extremely conservative, since it forces only one
 5764 tgl                      3385 ECB             :  * future log segment to exist, and even that only if we are 75% done with
                               3386                 :  * the current one.  This is only appropriate for very low-WAL-volume systems.
                               3387                 :  * High-volume systems will be OK once they've built up a sufficient set of
                               3388                 :  * recycled log segments, but the startup transient is likely to include
                               3389                 :  * a lot of segment creations by foreground processes, which is not so good.
                               3390                 :  *
                               3391                 :  * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
                               3392                 :  * trouble; for example, a full filesystem is one cause.  The checkpoint WAL
  650 noah                     3393                 :  * and/or ControlFile updates already completed.  If a RequestCheckpoint()
                               3394                 :  * initiated the present checkpoint and an ERROR ends this function, the
                               3395                 :  * command that called RequestCheckpoint() fails.  That's not ideal, but it's
                               3396                 :  * not worth contorting more functions to use caller-specified elevel values.
                               3397                 :  * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
                               3398                 :  * reporting and resource reclamation.)
                               3399                 :  */
 5762 tgl                      3400                 : static void
  520 rhaas                    3401 CBC        2538 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
 8062 tgl                      3402 ECB             : {
 3941 heikki.linnakangas       3403                 :     XLogSegNo   _logSegNo;
 8062 tgl                      3404                 :     int         lf;
  650 noah                     3405                 :     bool        added;
                               3406                 :     char        path[MAXPGPATH];
                               3407                 :     uint64      offset;
                               3408                 : 
  650 noah                     3409 GIC        2538 :     if (!XLogCtl->InstallXLogFileSegmentActive)
                               3410               4 :         return;                 /* unlocked check says no */
                               3411                 : 
 2028 andres                   3412            2534 :     XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
                               3413            2534 :     offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
                               3414            2534 :     if (offset >= (uint32) (0.75 * wal_segment_size))
                               3415                 :     {
 3941 heikki.linnakangas       3416              42 :         _logSegNo++;
  520 rhaas                    3417              42 :         lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
  650 noah                     3418              42 :         if (lf >= 0)
                               3419              33 :             close(lf);
                               3420              42 :         if (added)
 5762 tgl                      3421               9 :             CheckpointStats.ckpt_segs_added++;
 8062 tgl                      3422 ECB             :     }
                               3423                 : }
                               3424                 : 
                               3425                 : /*
                               3426                 :  * Throws an error if the given log segment has already been removed or
 3748 heikki.linnakangas       3427                 :  * recycled. The caller should only pass a segment that it knows to have
                               3428                 :  * existed while the server has been running, as this function always
                               3429                 :  * succeeds if no WAL segments have been removed since startup.
                               3430                 :  * 'tli' is only used in the error message.
 1952 tgl                      3431                 :  *
                               3432                 :  * Note: this function guarantees to keep errno unchanged on return.
                               3433                 :  * This supports callers that use this to possibly deliver a better
                               3434                 :  * error message about a missing file, while still being able to throw
 1952 tgl                      3435 EUB             :  * a normal file-access error afterwards, if this does return.
 4745 heikki.linnakangas       3436                 :  */
                               3437                 : void
 3748 heikki.linnakangas       3438 GIC       47944 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
                               3439                 : {
 1952 tgl                      3440           47944 :     int         save_errno = errno;
                               3441                 :     XLogSegNo   lastRemovedSegNo;
 4745 heikki.linnakangas       3442 ECB             : 
 3121 andres                   3443 CBC       47944 :     SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   3444 GIC       47944 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
                               3445           47944 :     SpinLockRelease(&XLogCtl->info_lck);
                               3446                 : 
 3748 heikki.linnakangas       3447           47944 :     if (segno <= lastRemovedSegNo)
                               3448                 :     {
                               3449                 :         char        filename[MAXFNAMELEN];
                               3450                 : 
 2028 andres                   3451 UIC           0 :         XLogFileName(filename, tli, segno, wal_segment_size);
 1952 tgl                      3452               0 :         errno = save_errno;
 3748 heikki.linnakangas       3453 LBC           0 :         ereport(ERROR,
                               3454                 :                 (errcode_for_file_access(),
                               3455                 :                  errmsg("requested WAL segment %s has already been removed",
                               3456                 :                         filename)));
 3748 heikki.linnakangas       3457 ECB             :     }
 1952 tgl                      3458 CBC       47944 :     errno = save_errno;
 4745 heikki.linnakangas       3459           47944 : }
                               3460                 : 
 3324 rhaas                    3461 ECB             : /*
                               3462                 :  * Return the last WAL segment removed, or 0 if no segment has been removed
                               3463                 :  * since startup.
                               3464                 :  *
                               3465                 :  * NB: the result can be out of date arbitrarily fast, the caller has to deal
                               3466                 :  * with that.
                               3467                 :  */
                               3468                 : XLogSegNo
 3324 rhaas                    3469 GIC         745 : XLogGetLastRemovedSegno(void)
 3324 rhaas                    3470 ECB             : {
                               3471                 :     XLogSegNo   lastRemovedSegNo;
                               3472                 : 
 3121 andres                   3473 GIC         745 :     SpinLockAcquire(&XLogCtl->info_lck);
                               3474             745 :     lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
 3121 andres                   3475 CBC         745 :     SpinLockRelease(&XLogCtl->info_lck);
                               3476                 : 
 3324 rhaas                    3477             745 :     return lastRemovedSegNo;
 3324 rhaas                    3478 ECB             : }
                               3479                 : 
 1097 alvherre                 3480                 : 
 4745 heikki.linnakangas       3481                 : /*
                               3482                 :  * Update the last removed segno pointer in shared memory, to reflect that the
                               3483                 :  * given XLOG file has been removed.
                               3484                 :  */
                               3485                 : static void
 4745 heikki.linnakangas       3486 GIC         573 : UpdateLastRemovedPtr(char *filename)
                               3487                 : {
                               3488                 :     uint32      tli;
                               3489                 :     XLogSegNo   segno;
 4745 heikki.linnakangas       3490 ECB             : 
 2028 andres                   3491 GIC         573 :     XLogFromFileName(filename, &tli, &segno, wal_segment_size);
                               3492                 : 
 3121                          3493             573 :     SpinLockAcquire(&XLogCtl->info_lck);
                               3494             573 :     if (segno > XLogCtl->lastRemovedSegNo)
 3121 andres                   3495 CBC         225 :         XLogCtl->lastRemovedSegNo = segno;
 3121 andres                   3496 GIC         573 :     SpinLockRelease(&XLogCtl->info_lck);
 4745 heikki.linnakangas       3497 CBC         573 : }
 4745 heikki.linnakangas       3498 ECB             : 
                               3499                 : /*
                               3500                 :  * Remove all temporary log files in pg_wal
                               3501                 :  *
 1731 michael                  3502                 :  * This is called at the beginning of recovery after a previous crash,
                               3503                 :  * at a point where no other processes write fresh WAL data.
                               3504                 :  */
 1731 michael                  3505 EUB             : static void
 1731 michael                  3506 GBC         131 : RemoveTempXlogFiles(void)
 1731 michael                  3507 EUB             : {
                               3508                 :     DIR        *xldir;
 1731 michael                  3509 ECB             :     struct dirent *xlde;
                               3510                 : 
 1731 michael                  3511 GIC         131 :     elog(DEBUG2, "removing all temporary WAL segments");
                               3512                 : 
                               3513             131 :     xldir = AllocateDir(XLOGDIR);
                               3514             709 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
                               3515                 :     {
                               3516                 :         char        path[MAXPGPATH];
                               3517                 : 
                               3518             578 :         if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
                               3519             578 :             continue;
                               3520                 : 
 1731 michael                  3521 UIC           0 :         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
                               3522               0 :         unlink(path);
 1731 michael                  3523 LBC           0 :         elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
                               3524                 :     }
 1731 michael                  3525 GIC         131 :     FreeDir(xldir);
                               3526             131 : }
                               3527                 : 
                               3528                 : /*
                               3529                 :  * Recycle or remove all log files older or equal to passed segno.
                               3530                 :  *
                               3531                 :  * endptr is current (or recent) end of xlog, and lastredoptr is the
                               3532                 :  * redo pointer of the last checkpoint. These are used to determine
 7934 tgl                      3533 ECB             :  * whether we want to recycle rather than delete no-longer-wanted log files.
  520 rhaas                    3534                 :  *
                               3535                 :  * insertTLI is the current timeline for XLOG insertion. Any recycled
                               3536                 :  * segments should be reused for this timeline.
                               3537                 :  */
                               3538                 : static void
  520 rhaas                    3539 GIC        2363 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
                               3540                 :                    TimeLineID insertTLI)
 8190 vadim4o                  3541 ECB             : {
                               3542                 :     DIR        *xldir;
 8053 bruce                    3543                 :     struct dirent *xlde;
                               3544                 :     char        lastoff[MAXFNAMELEN];
                               3545                 :     XLogSegNo   endlogSegNo;
  814 michael                  3546                 :     XLogSegNo   recycleSegNo;
                               3547                 : 
                               3548                 :     /* Initialize info about where to try to recycle to */
  814 michael                  3549 GIC        2363 :     XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
                               3550            2363 :     recycleSegNo = XLOGfileslop(lastredoptr);
 8190 vadim4o                  3551 ECB             : 
 3762 heikki.linnakangas       3552                 :     /*
                               3553                 :      * Construct a filename of the last segment to be kept. The timeline ID
                               3554                 :      * doesn't matter, we ignore that in the comparison. (During recovery,
                               3555                 :      * InsertTimeLineID isn't set, so we can't use that.)
                               3556                 :      */
 2028 andres                   3557 GIC        2363 :     XLogFileName(lastoff, 0, segno, wal_segment_size);
                               3558                 : 
 4605 simon                    3559            2363 :     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
                               3560                 :          lastoff);
                               3561                 : 
 1952 tgl                      3562            2363 :     xldir = AllocateDir(XLOGDIR);
                               3563                 : 
 6488                          3564           13798 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
                               3565                 :     {
 2918 heikki.linnakangas       3566 ECB             :         /* Ignore files that are not XLOG segments */
 2893 heikki.linnakangas       3567 GIC       11435 :         if (!IsXLogFileName(xlde->d_name) &&
 2893 heikki.linnakangas       3568 CBC        7147 :             !IsPartialXLogFileName(xlde->d_name))
 2918 heikki.linnakangas       3569 GIC        7145 :             continue;
                               3570                 : 
 6838 tgl                      3571 ECB             :         /*
                               3572                 :          * We ignore the timeline part of the XLOG segment identifiers in
 3260 bruce                    3573                 :          * deciding whether a segment is still needed.  This ensures that we
                               3574                 :          * won't prematurely remove a segment from a parent timeline. We could
                               3575                 :          * probably be a little more proactive about removing segments of
                               3576                 :          * non-parent timelines, but that would be a whole lot more
                               3577                 :          * complicated.
 6836 tgl                      3578                 :          *
 6385 bruce                    3579                 :          * We use the alphanumeric sorting property of the filenames to decide
                               3580                 :          * which ones are earlier than the lastoff segment.
                               3581                 :          */
 2918 heikki.linnakangas       3582 GIC        4290 :         if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
                               3583                 :         {
 3705                          3584             579 :             if (XLogArchiveCheckDone(xlde->d_name))
                               3585                 :             {
                               3586                 :                 /* Update the last removed location in shared memory first */
 4745                          3587             573 :                 UpdateLastRemovedPtr(xlde->d_name);
                               3588                 : 
  219 michael                  3589 GNC         573 :                 RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
                               3590                 :             }
                               3591                 :         }
                               3592                 :     }
                               3593                 : 
 2918 heikki.linnakangas       3594 GIC        2363 :     FreeDir(xldir);
                               3595            2363 : }
 2918 heikki.linnakangas       3596 ECB             : 
                               3597                 : /*
                               3598                 :  * Remove WAL files that are not part of the given timeline's history.
                               3599                 :  *
                               3600                 :  * This is called during recovery, whenever we switch to follow a new
                               3601                 :  * timeline, and at the end of recovery when we create a new timeline. We
                               3602                 :  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
                               3603                 :  * might be leftover pre-allocated or recycled WAL segments on the old timeline
                               3604                 :  * that we haven't used yet, and contain garbage. If we just leave them in
                               3605                 :  * pg_wal, they will eventually be archived, and we can't let that happen.
                               3606                 :  * Files that belong to our timeline history are valid, because we have
                               3607                 :  * successfully replayed them, but from others we can't be sure.
                               3608                 :  *
                               3609                 :  * 'switchpoint' is the current point in WAL where we switch to new timeline,
                               3610                 :  * and 'newTLI' is the new timeline we switch to.
                               3611                 :  */
                               3612                 : void
 2918 heikki.linnakangas       3613 GIC          48 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
                               3614                 : {
                               3615                 :     DIR        *xldir;
 2918 heikki.linnakangas       3616 ECB             :     struct dirent *xlde;
                               3617                 :     char        switchseg[MAXFNAMELEN];
                               3618                 :     XLogSegNo   endLogSegNo;
                               3619                 :     XLogSegNo   switchLogSegNo;
                               3620                 :     XLogSegNo   recycleSegNo;
                               3621                 : 
                               3622                 :     /*
  814 michael                  3623                 :      * Initialize info about where to begin the work.  This will recycle,
                               3624                 :      * somewhat arbitrarily, 10 future segments.
                               3625                 :      */
  814 michael                  3626 CBC          48 :     XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
                               3627              48 :     XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
  814 michael                  3628 GIC          48 :     recycleSegNo = endLogSegNo + 10;
                               3629                 : 
                               3630                 :     /*
                               3631                 :      * Construct a filename of the last segment to be kept.
                               3632                 :      */
                               3633              48 :     XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
 4959 heikki.linnakangas       3634 ECB             : 
 2918 heikki.linnakangas       3635 CBC          48 :     elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
                               3636                 :          switchseg);
                               3637                 : 
 1952 tgl                      3638 GIC          48 :     xldir = AllocateDir(XLOGDIR);
                               3639                 : 
 2918 heikki.linnakangas       3640             408 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
                               3641                 :     {
                               3642                 :         /* Ignore files that are not XLOG segments */
 2893 heikki.linnakangas       3643 CBC         360 :         if (!IsXLogFileName(xlde->d_name))
 2918                          3644             208 :             continue;
                               3645                 : 
                               3646                 :         /*
                               3647                 :          * Remove files that are on a timeline older than the new one we're
 2878 bruce                    3648 ECB             :          * switching to, but with a segment number >= the first segment on the
                               3649                 :          * new timeline.
                               3650                 :          */
 2918 heikki.linnakangas       3651 GIC         152 :         if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
                               3652              98 :             strcmp(xlde->d_name + 8, switchseg + 8) > 0)
                               3653                 :         {
                               3654                 :             /*
                               3655                 :              * If the file has already been marked as .ready, however, don't
                               3656                 :              * remove it yet. It should be OK to remove it - files that are
                               3657                 :              * not part of our timeline history are not required for recovery
                               3658                 :              * - but it seems safer to let them be archived and removed later.
                               3659                 :              */
                               3660              12 :             if (!XLogArchiveIsReady(xlde->d_name))
  219 michael                  3661 GNC          12 :                 RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
                               3662                 :         }
                               3663                 :     }
 2918 heikki.linnakangas       3664 ECB             : 
 2918 heikki.linnakangas       3665 GIC          48 :     FreeDir(xldir);
                               3666              48 : }
                               3667                 : 
                               3668                 : /*
                               3669                 :  * Recycle or remove a log file that's no longer needed.
                               3670                 :  *
                               3671                 :  * segment_de is the dirent structure of the segment to recycle or remove.
                               3672                 :  * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
                               3673                 :  * the segment number of the current (or recent) end of WAL.
  814 michael                  3674 ECB             :  *
                               3675                 :  * endlogSegNo gets incremented if the segment is recycled so that it is not
                               3676                 :  * checked again by future calls to this function.
                               3677                 :  *
                               3678                 :  * insertTLI is the current timeline for XLOG insertion. Any recycled segments
                               3679                 :  * should be used for this timeline.
                               3680                 :  */
 2918 heikki.linnakangas       3681                 : static void
  219 michael                  3682 GNC         585 : RemoveXlogFile(const struct dirent *segment_de,
                               3683                 :                XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
                               3684                 :                TimeLineID insertTLI)
 2918 heikki.linnakangas       3685 ECB             : {
                               3686                 :     char        path[MAXPGPATH];
                               3687                 : #ifdef WIN32
                               3688                 :     char        newpath[MAXPGPATH];
                               3689                 : #endif
  219 michael                  3690 GNC         585 :     const char *segname = segment_de->d_name;
                               3691                 : 
 2918 heikki.linnakangas       3692 CBC         585 :     snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
                               3693                 : 
 2918 heikki.linnakangas       3694 ECB             :     /*
                               3695                 :      * Before deleting the file, see if it can be recycled as a future log
                               3696                 :      * segment. Only recycle normal files, because we don't want to recycle
                               3697                 :      * symbolic links pointing to a separate archive directory.
                               3698                 :      */
 1468 tmunro                   3699 GIC         585 :     if (wal_recycle &&
  814 michael                  3700             585 :         *endlogSegNo <= recycleSegNo &&
  650 noah                     3701 CBC        1111 :         XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
  219 michael                  3702 GNC        1106 :         get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
  814 michael                  3703 GIC         553 :         InstallXLogFileSegment(endlogSegNo, path,
                               3704                 :                                true, recycleSegNo, insertTLI))
                               3705                 :     {
 2918 heikki.linnakangas       3706             533 :         ereport(DEBUG2,
                               3707                 :                 (errmsg_internal("recycled write-ahead log file \"%s\"",
                               3708                 :                                  segname)));
                               3709             533 :         CheckpointStats.ckpt_segs_recycled++;
                               3710                 :         /* Needn't recheck that slot on future iterations */
  814 michael                  3711             533 :         (*endlogSegNo)++;
                               3712                 :     }
                               3713                 :     else
                               3714                 :     {
                               3715                 :         /* No need for any more future segments, or recycling failed ... */
                               3716                 :         int         rc;
                               3717                 : 
 2918 heikki.linnakangas       3718              52 :         ereport(DEBUG2,
                               3719                 :                 (errmsg_internal("removing write-ahead log file \"%s\"",
                               3720                 :                                  segname)));
                               3721                 : 
                               3722                 : #ifdef WIN32
                               3723                 : 
                               3724                 :         /*
                               3725                 :          * On Windows, if another process (e.g. another backend) holds the file
                               3726                 :          * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
                               3727                 :          * will still show up in directory listing until the last handle is
 2878 bruce                    3728 ECB             :          * closed. To avoid confusing the lingering deleted file for a live
                               3729                 :          * WAL file that needs to be archived, rename it before deleting it.
 2918 heikki.linnakangas       3730                 :          *
                               3731                 :          * If another process holds the file open without the FILE_SHARE_DELETE
                               3732                 :          * flag, rename will fail. We'll try again at the next checkpoint.
 2918 heikki.linnakangas       3733 EUB             :          */
                               3734                 :         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
 2918 heikki.linnakangas       3735 ECB             :         if (rename(path, newpath) != 0)
                               3736                 :         {
                               3737                 :             ereport(LOG,
                               3738                 :                     (errcode_for_file_access(),
                               3739                 :                      errmsg("could not rename file \"%s\": %m",
                               3740                 :                             path)));
                               3741                 :             return;
                               3742                 :         }
                               3743                 :         rc = durable_unlink(newpath, LOG);
                               3744                 : #else
 2204 teodor                   3745 GIC          52 :         rc = durable_unlink(path, LOG);
                               3746                 : #endif
 2918 heikki.linnakangas       3747              52 :         if (rc != 0)
                               3748                 :         {
                               3749                 :             /* Message already logged by durable_unlink() */
 2918 heikki.linnakangas       3750 UIC           0 :             return;
                               3751                 :         }
 2918 heikki.linnakangas       3752 GIC          52 :         CheckpointStats.ckpt_segs_removed++;
                               3753                 :     }
                               3754                 : 
 2918 heikki.linnakangas       3755 CBC         585 :     XLogArchiveCleanup(segname);
                               3756                 : }
                               3757                 : 
                               3758                 : /*
                               3759                 :  * Verify whether pg_wal and pg_wal/archive_status exist.
                               3760                 :  * If the latter does not exist, recreate it.
 5264 tgl                      3761 ECB             :  *
                               3762                 :  * It is not the goal of this function to verify the contents of these
 5264 tgl                      3763 EUB             :  * directories, but to help in cases where someone has performed a cluster
                               3764                 :  * copy for PITR purposes but omitted pg_wal from the copy.
                               3765                 :  *
                               3766                 :  * We could also recreate pg_wal if it doesn't exist, but a deliberate
                               3767                 :  * policy decision was made not to.  It is fairly common for pg_wal to be
 5264 tgl                      3768 ECB             :  * a symlink, and if that was the DBA's intent then automatically making a
                               3769                 :  * plain directory would result in degraded performance with no notice.
                               3770                 :  */
                               3771                 : static void
 5264 tgl                      3772 CBC        1176 : ValidateXLOGDirectoryStructure(void)
 5264 tgl                      3773 EUB             : {
                               3774                 :     char        path[MAXPGPATH];
                               3775                 :     struct stat stat_buf;
                               3776                 : 
                               3777                 :     /* Check for pg_wal; if it doesn't exist, error out */
 5264 tgl                      3778 GIC        1176 :     if (stat(XLOGDIR, &stat_buf) != 0 ||
 5264 tgl                      3779 GBC        1176 :         !S_ISDIR(stat_buf.st_mode))
 5050 bruce                    3780 UIC           0 :         ereport(FATAL,
 5264 tgl                      3781 EUB             :                 (errmsg("required WAL directory \"%s\" does not exist",
                               3782                 :                         XLOGDIR)));
                               3783                 : 
                               3784                 :     /* Check for archive_status */
 5264 tgl                      3785 GIC        1176 :     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
 5264 tgl                      3786 CBC        1176 :     if (stat(path, &stat_buf) == 0)
                               3787                 :     {
                               3788                 :         /* Check for weird cases where it exists but isn't a directory */
 5264 tgl                      3789 GIC        1176 :         if (!S_ISDIR(stat_buf.st_mode))
 5050 bruce                    3790 UIC           0 :             ereport(FATAL,
                               3791                 :                     (errmsg("required WAL directory \"%s\" does not exist",
                               3792                 :                             path)));
                               3793                 :     }
 5264 tgl                      3794 ECB             :     else
                               3795                 :     {
 5264 tgl                      3796 UIC           0 :         ereport(LOG,
                               3797                 :                 (errmsg("creating missing WAL directory \"%s\"", path)));
 1828 sfrost                   3798               0 :         if (MakePGDirectory(path) < 0)
 5050 bruce                    3799               0 :             ereport(FATAL,
 5264 tgl                      3800 ECB             :                     (errmsg("could not create missing directory \"%s\": %m",
                               3801                 :                             path)));
                               3802                 :     }
 5264 tgl                      3803 GIC        1176 : }
 5264 tgl                      3804 ECB             : 
                               3805                 : /*
 6135                          3806                 :  * Remove previous backup history files.  This also retries creation of
                               3807                 :  * .ready files for any backup history files for which XLogArchiveNotify
                               3808                 :  * failed earlier.
                               3809                 :  */
 6507 bruce                    3810                 : static void
 6135 tgl                      3811 CBC         117 : CleanupBackupHistory(void)
 6507 bruce                    3812 ECB             : {
                               3813                 :     DIR        *xldir;
                               3814                 :     struct dirent *xlde;
                               3815                 :     char        path[MAXPGPATH + sizeof(XLOGDIR)];
                               3816                 : 
 6488 tgl                      3817 CBC         117 :     xldir = AllocateDir(XLOGDIR);
 6507 bruce                    3818 ECB             : 
 6488 tgl                      3819 GIC        1047 :     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
                               3820                 :     {
 2893 heikki.linnakangas       3821             813 :         if (IsBackupHistoryFileName(xlde->d_name))
                               3822                 :         {
 5326 tgl                      3823             118 :             if (XLogArchiveCheckDone(xlde->d_name))
                               3824                 :             {
 2158 peter_e                  3825             108 :                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
                               3826                 :                      xlde->d_name);
 2189                          3827             108 :                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
 6507 bruce                    3828             108 :                 unlink(path);
                               3829             108 :                 XLogArchiveCleanup(xlde->d_name);
                               3830                 :             }
                               3831                 :         }
                               3832                 :     }
                               3833                 : 
                               3834             117 :     FreeDir(xldir);
                               3835             117 : }
                               3836                 : 
 8062 tgl                      3837 ECB             : /*
                               3838                 :  * I/O routines for pg_control
                               3839                 :  *
                               3840                 :  * *ControlFile is a buffer in shared memory that holds an image of the
                               3841                 :  * contents of pg_control.  WriteControlFile() initializes pg_control
                               3842                 :  * given a preloaded buffer, ReadControlFile() loads the buffer from
                               3843                 :  * the pg_control file (during postmaster or standalone-backend startup),
                               3844                 :  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
                               3845                 :  * InitControlFile() fills the buffer with initial values.
                               3846                 :  *
  417 heikki.linnakangas       3847                 :  * For simplicity, WriteControlFile() initializes the fields of pg_control
  417 heikki.linnakangas       3848 EUB             :  * that are related to checking backend/database compatibility, and
                               3849                 :  * ReadControlFile() verifies they are correct.  We could split out the
                               3850                 :  * I/O and compatibility-check functions, but there seems no need currently.
                               3851                 :  */
  699 tmunro                   3852 ECB             : 
                               3853                 : static void
  417 heikki.linnakangas       3854 CBC         305 : InitControlFile(uint64 sysidentifier)
  417 heikki.linnakangas       3855 ECB             : {
                               3856                 :     char        mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
 8595 vadim4o                  3857                 : 
                               3858                 :     /*
                               3859                 :      * Generate a random nonce. This is used for authentication requests that
  417 heikki.linnakangas       3860                 :      * will fail because the user does not exist. The nonce is used to create
                               3861                 :      * a genuine-looking password challenge for the non-existent user, in lieu
                               3862                 :      * of an actual stored password.
                               3863                 :      */
  417 heikki.linnakangas       3864 CBC         305 :     if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
  417 heikki.linnakangas       3865 LBC           0 :         ereport(PANIC,
  417 heikki.linnakangas       3866 ECB             :                 (errcode(ERRCODE_INTERNAL_ERROR),
                               3867                 :                  errmsg("could not generate secret authorization token")));
 8595 vadim4o                  3868                 : 
  417 heikki.linnakangas       3869 CBC         305 :     memset(ControlFile, 0, sizeof(ControlFileData));
                               3870                 :     /* Initialize pg_control status fields */
  417 heikki.linnakangas       3871 GIC         305 :     ControlFile->system_identifier = sysidentifier;
  417 heikki.linnakangas       3872 CBC         305 :     memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
  417 heikki.linnakangas       3873 GIC         305 :     ControlFile->state = DB_SHUTDOWNED;
                               3874             305 :     ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
                               3875                 : 
                               3876                 :     /* Set important parameter values for use when replaying WAL */
 1147 peter                    3877             305 :     ControlFile->MaxConnections = MaxConnections;
                               3878             305 :     ControlFile->max_worker_processes = max_worker_processes;
                               3879             305 :     ControlFile->max_wal_senders = max_wal_senders;
 1147 peter                    3880 CBC         305 :     ControlFile->max_prepared_xacts = max_prepared_xacts;
                               3881             305 :     ControlFile->max_locks_per_xact = max_locks_per_xact;
 1147 peter                    3882 GIC         305 :     ControlFile->wal_level = wal_level;
 1147 peter                    3883 CBC         305 :     ControlFile->wal_log_hints = wal_log_hints;
                               3884             305 :     ControlFile->track_commit_timestamp = track_commit_timestamp;
 1147 peter                    3885 GIC         305 :     ControlFile->data_checksum_version = bootstrap_data_checksum_version;
 1147 peter                    3886 CBC         305 : }
 1147 peter                    3887 ECB             : 
 8170 tgl                      3888                 : static void
 8170 tgl                      3889 CBC         305 : WriteControlFile(void)
                               3890                 : {
 8170 tgl                      3891 ECB             :     int         fd;
 2090                          3892                 :     char        buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
                               3893                 : 
                               3894                 :     /*
 8062                          3895                 :      * Initialize version and compatibility-check fields
                               3896                 :      */
 8062 tgl                      3897 GIC         305 :     ControlFile->pg_control_version = PG_CONTROL_VERSION;
                               3898             305 :     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
                               3899                 : 
 6397                          3900             305 :     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
                               3901             305 :     ControlFile->floatFormat = FLOATFORMAT_VALUE;
                               3902                 : 
 8170                          3903             305 :     ControlFile->blcksz = BLCKSZ;
 8170 tgl                      3904 CBC         305 :     ControlFile->relseg_size = RELSEG_SIZE;
 6215                          3905             305 :     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
 2028 andres                   3906 GIC         305 :     ControlFile->xlog_seg_size = wal_segment_size;
 7658 lockhart                 3907 ECB             : 
 7658 lockhart                 3908 GIC         305 :     ControlFile->nameDataLen = NAMEDATALEN;
 6585 tgl                      3909 CBC         305 :     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
 7658 lockhart                 3910 EUB             : 
 5850 tgl                      3911 GIC         305 :     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
 3230                          3912             305 :     ControlFile->loblksize = LOBLKSIZE;
                               3913                 : 
 5466                          3914             305 :     ControlFile->float8ByVal = FLOAT8PASSBYVAL;
 7658 lockhart                 3915 ECB             : 
 8062 tgl                      3916                 :     /* Contents are protected with a CRC */
 3078 heikki.linnakangas       3917 CBC         305 :     INIT_CRC32C(ControlFile->crc);
 3078 heikki.linnakangas       3918 GIC         305 :     COMP_CRC32C(ControlFile->crc,
                               3919                 :                 (char *) ControlFile,
 3078 heikki.linnakangas       3920 EUB             :                 offsetof(ControlFileData, crc));
 3078 heikki.linnakangas       3921 GBC         305 :     FIN_CRC32C(ControlFile->crc);
 8062 tgl                      3922 EUB             : 
                               3923                 :     /*
                               3924                 :      * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
                               3925                 :      * the excess over sizeof(ControlFileData).  This reduces the odds of
                               3926                 :      * premature-EOF errors when reading pg_control.  We'll still fail when we
 6214 tgl                      3927 ECB             :      * check the contents of the file, but hopefully with a more specific
                               3928                 :      * error than "couldn't read pg_control".
 8170                          3929                 :      */
 2090 tgl                      3930 CBC         305 :     memset(buffer, 0, PG_CONTROL_FILE_SIZE);
 8170 tgl                      3931 GBC         305 :     memcpy(buffer, ControlFile, sizeof(ControlFileData));
                               3932                 : 
 6488 tgl                      3933 GIC         305 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
                               3934                 :                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
 8170 tgl                      3935 CBC         305 :     if (fd < 0)
 7202 tgl                      3936 UIC           0 :         ereport(PANIC,
 7202 tgl                      3937 ECB             :                 (errcode_for_file_access(),
 1721 michael                  3938 EUB             :                  errmsg("could not create file \"%s\": %m",
                               3939                 :                         XLOG_CONTROL_FILE)));
                               3940                 : 
 7977 tgl                      3941 GIC         305 :     errno = 0;
 2213 rhaas                    3942 CBC         305 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
 2090 tgl                      3943 GIC         305 :     if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
                               3944                 :     {
 7977 tgl                      3945 ECB             :         /* if write didn't set errno, assume problem is no disk space */
 7977 tgl                      3946 UIC           0 :         if (errno == 0)
                               3947               0 :             errno = ENOSPC;
 7202                          3948               0 :         ereport(PANIC,
                               3949                 :                 (errcode_for_file_access(),
                               3950                 :                  errmsg("could not write to file \"%s\": %m",
                               3951                 :                         XLOG_CONTROL_FILE)));
                               3952                 :     }
 2213 rhaas                    3953 GIC         305 :     pgstat_report_wait_end();
                               3954                 : 
 2213 rhaas                    3955 CBC         305 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
 8157 tgl                      3956 GIC         305 :     if (pg_fsync(fd) != 0)
 7202 tgl                      3957 LBC           0 :         ereport(PANIC,
 7202 tgl                      3958 EUB             :                 (errcode_for_file_access(),
                               3959                 :                  errmsg("could not fsync file \"%s\": %m",
                               3960                 :                         XLOG_CONTROL_FILE)));
 2213 rhaas                    3961 GIC         305 :     pgstat_report_wait_end();
                               3962                 : 
 1373 peter                    3963 CBC         305 :     if (close(fd) != 0)
 7013 tgl                      3964 LBC           0 :         ereport(PANIC,
 7013 tgl                      3965 ECB             :                 (errcode_for_file_access(),
                               3966                 :                  errmsg("could not close file \"%s\": %m",
 1721 michael                  3967 EUB             :                         XLOG_CONTROL_FILE)));
 8170 tgl                      3968 GBC         305 : }
                               3969                 : 
                               3970                 : static void
 8170 tgl                      3971 GIC        1222 : ReadControlFile(void)
                               3972                 : {
 2917 heikki.linnakangas       3973 EUB             :     pg_crc32c   crc;
                               3974                 :     int         fd;
                               3975                 :     static char wal_segsz_str[20];
                               3976                 :     int         r;
                               3977                 : 
 8170 tgl                      3978 ECB             :     /*
                               3979                 :      * Read data...
                               3980                 :      */
 6488 tgl                      3981 GIC        1222 :     fd = BasicOpenFile(XLOG_CONTROL_FILE,
                               3982                 :                        O_RDWR | PG_BINARY);
 8170                          3983            1222 :     if (fd < 0)
 7202 tgl                      3984 UIC           0 :         ereport(PANIC,
                               3985                 :                 (errcode_for_file_access(),
                               3986                 :                  errmsg("could not open file \"%s\": %m",
                               3987                 :                         XLOG_CONTROL_FILE)));
                               3988                 : 
 2213 rhaas                    3989 CBC        1222 :     pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
 1787 magnus                   3990 GBC        1222 :     r = read(fd, ControlFile, sizeof(ControlFileData));
 1787 magnus                   3991 GIC        1222 :     if (r != sizeof(ControlFileData))
                               3992                 :     {
 1787 magnus                   3993 UIC           0 :         if (r < 0)
                               3994               0 :             ereport(PANIC,
                               3995                 :                     (errcode_for_file_access(),
                               3996                 :                      errmsg("could not read file \"%s\": %m",
                               3997                 :                             XLOG_CONTROL_FILE)));
 1787 magnus                   3998 ECB             :         else
 1787 magnus                   3999 UBC           0 :             ereport(PANIC,
                               4000                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
                               4001                 :                      errmsg("could not read file \"%s\": read %d of %zu",
                               4002                 :                             XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
                               4003                 :     }
 2213 rhaas                    4004 GIC        1222 :     pgstat_report_wait_end();
                               4005                 : 
 8170 tgl                      4006            1222 :     close(fd);
 8170 tgl                      4007 ECB             : 
 8062                          4008                 :     /*
                               4009                 :      * Check for expected pg_control format version.  If this is wrong, the
                               4010                 :      * CRC check will likely fail because we'll be checking the wrong number
 6385 bruce                    4011                 :      * of bytes.  Complaining about wrong version will probably be more
                               4012                 :      * enlightening than complaining about wrong CRC.
 8062 tgl                      4013                 :      */
 5557 peter_e                  4014 EUB             : 
 5557 peter_e                  4015 GIC        1222 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
 5557 peter_e                  4016 UIC           0 :         ereport(FATAL,
                               4017                 :                 (errmsg("database files are incompatible with server"),
                               4018                 :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
                               4019                 :                            " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
                               4020                 :                            ControlFile->pg_control_version, ControlFile->pg_control_version,
                               4021                 :                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
 5557 peter_e                  4022 ECB             :                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
 5557 peter_e                  4023 EUB             : 
 8062 tgl                      4024 GIC        1222 :     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
 7202 tgl                      4025 UIC           0 :         ereport(FATAL,
                               4026                 :                 (errmsg("database files are incompatible with server"),
                               4027                 :                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
                               4028                 :                            " but the server was compiled with PG_CONTROL_VERSION %d.",
 2118 tgl                      4029 ECB             :                            ControlFile->pg_control_version, PG_CONTROL_VERSION),
 7202 tgl                      4030 EUB             :                  errhint("It looks like you need to initdb.")));
                               4031                 : 
                               4032                 :     /* Now check the CRC. */
 3078 heikki.linnakangas       4033 GIC        1222 :     INIT_CRC32C(crc);
                               4034            1222 :     COMP_CRC32C(crc,
                               4035                 :                 (char *) ControlFile,
 3078 heikki.linnakangas       4036 ECB             :                 offsetof(ControlFileData, crc));
 3078 heikki.linnakangas       4037 GBC        1222 :     FIN_CRC32C(crc);
                               4038                 : 
 3078 heikki.linnakangas       4039 GIC        1222 :     if (!EQ_CRC32C(crc, ControlFile->crc))
 7202 tgl                      4040 UIC           0 :         ereport(FATAL,
 7136 peter_e                  4041 ECB             :                 (errmsg("incorrect checksum in control file")));
 8137 vadim4o                  4042 EUB             : 
                               4043                 :     /*
                               4044                 :      * Do compatibility checking immediately.  If the database isn't
                               4045                 :      * compatible with the backend executable, we want to abort before we can
                               4046                 :      * possibly do any damage.
                               4047                 :      */
 8062 tgl                      4048 CBC        1222 :     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
 7202 tgl                      4049 UBC           0 :         ereport(FATAL,
                               4050                 :                 (errmsg("database files are incompatible with server"),
                               4051                 :                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
                               4052                 :                            " but the server was compiled with CATALOG_VERSION_NO %d.",
                               4053                 :                            ControlFile->catalog_version_no, CATALOG_VERSION_NO),
                               4054                 :                  errhint("It looks like you need to initdb.")));
 6397 tgl                      4055 CBC        1222 :     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
 6397 tgl                      4056 UBC           0 :         ereport(FATAL,
                               4057                 :                 (errmsg("database files are incompatible with server"),
                               4058                 :                  errdetail("The database cluster was initialized with MAXALIGN %d,"
                               4059                 :                            " but the server was compiled with MAXALIGN %d.",
                               4060                 :                            ControlFile->maxAlign, MAXIMUM_ALIGNOF),
                               4061                 :                  errhint("It looks like you need to initdb.")));
 6397 tgl                      4062 CBC        1222 :     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
 6397 tgl                      4063 UBC           0 :         ereport(FATAL,
                               4064                 :                 (errmsg("database files are incompatible with server"),
                               4065                 :                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
                               4066                 :                  errhint("It looks like you need to initdb.")));
 8170 tgl                      4067 GIC        1222 :     if (ControlFile->blcksz != BLCKSZ)
 7202 tgl                      4068 UIC           0 :         ereport(FATAL,
 7202 tgl                      4069 ECB             :                 (errmsg("database files are incompatible with server"),
 2118 tgl                      4070 EUB             :                  errdetail("The database cluster was initialized with BLCKSZ %d,"
                               4071                 :                            " but the server was compiled with BLCKSZ %d.",
                               4072                 :                            ControlFile->blcksz, BLCKSZ),
                               4073                 :                  errhint("It looks like you need to recompile or initdb.")));
 8170 tgl                      4074 GIC        1222 :     if (ControlFile->relseg_size != RELSEG_SIZE)
 7202 tgl                      4075 UIC           0 :         ereport(FATAL,
 7202 tgl                      4076 ECB             :                 (errmsg("database files are incompatible with server"),
 2118 tgl                      4077 EUB             :                  errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
                               4078                 :                            " but the server was compiled with RELSEG_SIZE %d.",
                               4079                 :                            ControlFile->relseg_size, RELSEG_SIZE),
                               4080                 :                  errhint("It looks like you need to recompile or initdb.")));
 6215 tgl                      4081 GIC        1222 :     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
 6215 tgl                      4082 UIC           0 :         ereport(FATAL,
 6215 tgl                      4083 ECB             :                 (errmsg("database files are incompatible with server"),
 2118 tgl                      4084 EUB             :                  errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
                               4085                 :                            " but the server was compiled with XLOG_BLCKSZ %d.",
                               4086                 :                            ControlFile->xlog_blcksz, XLOG_BLCKSZ),
                               4087                 :                  errhint("It looks like you need to recompile or initdb.")));
 7658 lockhart                 4088 GIC        1222 :     if (ControlFile->nameDataLen != NAMEDATALEN)
 7202 tgl                      4089 UIC           0 :         ereport(FATAL,
                               4090                 :                 (errmsg("database files are incompatible with server"),
                               4091                 :                  errdetail("The database cluster was initialized with NAMEDATALEN %d,"
 2118 tgl                      4092 ECB             :                            " but the server was compiled with NAMEDATALEN %d.",
 2118 tgl                      4093 EUB             :                            ControlFile->nameDataLen, NAMEDATALEN),
                               4094                 :                  errhint("It looks like you need to recompile or initdb.")));
 6585 tgl                      4095 GIC        1222 :     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
 7202 tgl                      4096 UIC           0 :         ereport(FATAL,
                               4097                 :                 (errmsg("database files are incompatible with server"),
                               4098                 :                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
                               4099                 :                            " but the server was compiled with INDEX_MAX_KEYS %d.",
                               4100                 :                            ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
                               4101                 :                  errhint("It looks like you need to recompile or initdb.")));
 5850 tgl                      4102 GIC        1222 :     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
 5850 tgl                      4103 UIC           0 :         ereport(FATAL,
                               4104                 :                 (errmsg("database files are incompatible with server"),
                               4105                 :                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
                               4106                 :                            " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
 2118 tgl                      4107 ECB             :                            ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
                               4108                 :                  errhint("It looks like you need to recompile or initdb.")));
 3230 tgl                      4109 CBC        1222 :     if (ControlFile->loblksize != LOBLKSIZE)
 3230 tgl                      4110 UBC           0 :         ereport(FATAL,
                               4111                 :                 (errmsg("database files are incompatible with server"),
                               4112                 :                  errdetail("The database cluster was initialized with LOBLKSIZE %d,"
                               4113                 :                            " but the server was compiled with LOBLKSIZE %d.",
                               4114                 :                            ControlFile->loblksize, (int) LOBLKSIZE),
                               4115                 :                  errhint("It looks like you need to recompile or initdb.")));
 7658 lockhart                 4116 ECB             : 
 5466 tgl                      4117                 : #ifdef USE_FLOAT8_BYVAL
 5466 tgl                      4118 GIC        1222 :     if (ControlFile->float8ByVal != true)
 5466 tgl                      4119 UIC           0 :         ereport(FATAL,
                               4120                 :                 (errmsg("database files are incompatible with server"),
 5466 tgl                      4121 ECB             :                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
 2118 tgl                      4122 EUB             :                            " but the server was compiled with USE_FLOAT8_BYVAL."),
                               4123                 :                  errhint("It looks like you need to recompile or initdb.")));
                               4124                 : #else
 5466 tgl                      4125 ECB             :     if (ControlFile->float8ByVal != false)
 5466 tgl                      4126 EUB             :         ereport(FATAL,
                               4127                 :                 (errmsg("database files are incompatible with server"),
                               4128                 :                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
 2118 tgl                      4129 ECB             :                            " but the server was compiled without USE_FLOAT8_BYVAL."),
 5466                          4130                 :                  errhint("It looks like you need to recompile or initdb.")));
                               4131                 : #endif
                               4132                 : 
 2028 andres                   4133 CBC        1222 :     wal_segment_size = ControlFile->xlog_seg_size;
                               4134                 : 
 2028 andres                   4135 GIC        1222 :     if (!IsValidWalSegSize(wal_segment_size))
 2028 andres                   4136 LBC           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               4137                 :                         errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
 1788 peter_e                  4138 ECB             :                                       "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
                               4139                 :                                       wal_segment_size,
                               4140                 :                                       wal_segment_size)));
                               4141                 : 
 2028 andres                   4142 GIC        1222 :     snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
                               4143            1222 :     SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
                               4144                 :                     PGC_S_DYNAMIC_DEFAULT);
 2028 andres                   4145 ECB             : 
                               4146                 :     /* check and update variables dependent on wal_segment_size */
 2028 andres                   4147 CBC        1222 :     if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
 2028 andres                   4148 LBC           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               4149                 :                         errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
                               4150                 : 
 2028 andres                   4151 GIC        1222 :     if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
 2028 andres                   4152 UIC           0 :         ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               4153                 :                         errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
 2028 andres                   4154 ECB             : 
 2028 andres                   4155 GIC        1222 :     UsableBytesInSegment =
 2028 andres                   4156 CBC        1222 :         (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
 2028 andres                   4157 ECB             :         (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
                               4158                 : 
 2028 andres                   4159 GIC        1222 :     CalculateCheckpointSegments();
                               4160                 : 
                               4161                 :     /* Make the initdb settings visible as GUC variables, too */
 1826 magnus                   4162            1222 :     SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
                               4163                 :                     PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
 8170 tgl                      4164 CBC        1222 : }
                               4165                 : 
 1483 michael                  4166 ECB             : /*
                               4167                 :  * Utility wrapper to update the control file.  Note that the control
                               4168                 :  * file gets flushed.
                               4169                 :  */
                               4170                 : static void
 8170 tgl                      4171 GIC       10909 : UpdateControlFile(void)
                               4172                 : {
 1469 peter                    4173           10909 :     update_controlfile(DataDir, ControlFile, true);
 8586 vadim4o                  4174 CBC       10909 : }
                               4175                 : 
 4832 heikki.linnakangas       4176 ECB             : /*
                               4177                 :  * Returns the unique system identifier from control file.
                               4178                 :  */
                               4179                 : uint64
 4832 heikki.linnakangas       4180 GIC         926 : GetSystemIdentifier(void)
                               4181                 : {
                               4182             926 :     Assert(ControlFile != NULL);
                               4183             926 :     return ControlFile->system_identifier;
                               4184                 : }
                               4185                 : 
                               4186                 : /*
                               4187                 :  * Returns the random nonce from control file.
                               4188                 :  */
                               4189                 : char *
 2224 heikki.linnakangas       4190 CBC           1 : GetMockAuthenticationNonce(void)
                               4191                 : {
 2224 heikki.linnakangas       4192 GIC           1 :     Assert(ControlFile != NULL);
                               4193               1 :     return ControlFile->mock_authentication_nonce;
                               4194                 : }
 2224 heikki.linnakangas       4195 ECB             : 
 3670 simon                    4196                 : /*
                               4197                 :  * Are checksums enabled for data pages?
                               4198                 :  */
                               4199                 : bool
 1826 magnus                   4200 GIC    14333955 : DataChecksumsEnabled(void)
                               4201                 : {
 3670 simon                    4202        14333955 :     Assert(ControlFile != NULL);
 3631                          4203        14333955 :     return (ControlFile->data_checksum_version > 0);
                               4204                 : }
                               4205                 : 
                               4206                 : /*
                               4207                 :  * Returns a fake LSN for unlogged relations.
                               4208                 :  *
                               4209                 :  * Each call generates an LSN that is greater than any previous value
                               4210                 :  * returned. The current counter value is saved and restored across clean
                               4211                 :  * shutdowns, but like unlogged relations, does not survive a crash. This can
                               4212                 :  * be used in lieu of real LSN values returned by XLogInsert, if you need an
                               4213                 :  * LSN-like increasing sequence of numbers without writing any WAL.
 3709 heikki.linnakangas       4214 ECB             :  */
                               4215                 : XLogRecPtr
 3709 heikki.linnakangas       4216 GIC          33 : GetFakeLSNForUnloggedRel(void)
                               4217                 : {
 3602 bruce                    4218 ECB             :     XLogRecPtr  nextUnloggedLSN;
 3709 heikki.linnakangas       4219                 : 
                               4220                 :     /* increment the unloggedLSN counter, need SpinLock */
 3121 andres                   4221 CBC          33 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
                               4222              33 :     nextUnloggedLSN = XLogCtl->unloggedLSN++;
                               4223              33 :     SpinLockRelease(&XLogCtl->ulsn_lck);
                               4224                 : 
 3709 heikki.linnakangas       4225 GIC          33 :     return nextUnloggedLSN;
                               4226                 : }
                               4227                 : 
                               4228                 : /*
                               4229                 :  * Auto-tune the number of XLOG buffers.
 4460 tgl                      4230 ECB             :  *
                               4231                 :  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
                               4232                 :  * a maximum of one XLOG segment (there is little reason to think that more
                               4233                 :  * is helpful, at least so long as we force an fsync when switching log files)
                               4234                 :  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
 4385                          4235                 :  * 9.1, when auto-tuning was added).
                               4236                 :  *
                               4237                 :  * This should not be called until NBuffers has received its final value.
                               4238                 :  */
                               4239                 : static int
 4385 tgl                      4240 GIC        1825 : XLOGChooseNumBuffers(void)
 4460 tgl                      4241 ECB             : {
 4385                          4242                 :     int         xbuffers;
                               4243                 : 
 4385 tgl                      4244 GIC        1825 :     xbuffers = NBuffers / 32;
 2028 andres                   4245 GBC        1825 :     if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
 2028 andres                   4246 GIC          18 :         xbuffers = (wal_segment_size / XLOG_BLCKSZ);
 4385 tgl                      4247            1825 :     if (xbuffers < 8)
                               4248             327 :         xbuffers = 8;
                               4249            1825 :     return xbuffers;
                               4250                 : }
                               4251                 : 
                               4252                 : /*
                               4253                 :  * GUC check_hook for wal_buffers
                               4254                 :  */
 4385 tgl                      4255 ECB             : bool
 4385 tgl                      4256 GBC        3682 : check_wal_buffers(int *newval, void **extra, GucSource source)
                               4257                 : {
 4385 tgl                      4258 ECB             :     /*
                               4259                 :      * -1 indicates a request for auto-tune.
                               4260                 :      */
 4385 tgl                      4261 GIC        3682 :     if (*newval == -1)
                               4262                 :     {
                               4263                 :         /*
                               4264                 :          * If we haven't yet changed the boot_val default of -1, just let it
 3260 bruce                    4265 ECB             :          * be.  We'll fix it when XLOGShmemSize is called.
                               4266                 :          */
 4385 tgl                      4267 GIC        1857 :         if (XLOGbuffers == -1)
                               4268            1857 :             return true;
                               4269                 : 
                               4270                 :         /* Otherwise, substitute the auto-tune value */
 4385 tgl                      4271 UIC           0 :         *newval = XLOGChooseNumBuffers();
                               4272                 :     }
 4385 tgl                      4273 ECB             : 
                               4274                 :     /*
                               4275                 :      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
                               4276                 :      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
                               4277                 :      * the case, we just silently treat such values as a request for the
                               4278                 :      * minimum.  (We could throw an error instead, but that doesn't seem very
                               4279                 :      * helpful.)
                               4280                 :      */
 4385 tgl                      4281 GIC        1825 :     if (*newval < 4)
 4385 tgl                      4282 UBC           0 :         *newval = 4;
 4385 tgl                      4283 EUB             : 
 4385 tgl                      4284 GBC        1825 :     return true;
 4460 tgl                      4285 EUB             : }
                               4286                 : 
                               4287                 : /*
                               4288                 :  * GUC check_hook for wal_consistency_checking
                               4289                 :  */
                               4290                 : bool
  208 tgl                      4291 GNC        1859 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
                               4292                 : {
                               4293                 :     char       *rawstring;
                               4294                 :     List       *elemlist;
                               4295                 :     ListCell   *l;
                               4296                 :     bool        newwalconsistency[RM_MAX_ID + 1];
                               4297                 : 
                               4298                 :     /* Initialize the array */
                               4299           61347 :     MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
                               4300                 : 
                               4301                 :     /* Need a modifiable copy of string */
                               4302            1859 :     rawstring = pstrdup(*newval);
                               4303                 : 
                               4304                 :     /* Parse string into list of identifiers */
                               4305            1859 :     if (!SplitIdentifierString(rawstring, ',', &elemlist))
                               4306                 :     {
                               4307                 :         /* syntax error in list */
  208 tgl                      4308 UNC           0 :         GUC_check_errdetail("List syntax is invalid.");
                               4309               0 :         pfree(rawstring);
                               4310               0 :         list_free(elemlist);
                               4311               0 :         return false;
                               4312                 :     }
                               4313                 : 
  208 tgl                      4314 GNC        1861 :     foreach(l, elemlist)
                               4315                 :     {
                               4316               2 :         char       *tok = (char *) lfirst(l);
                               4317                 :         int         rmid;
                               4318                 : 
                               4319                 :         /* Check for 'all'. */
                               4320               2 :         if (pg_strcasecmp(tok, "all") == 0)
                               4321                 :         {
  208 tgl                      4322 UNC           0 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
                               4323               0 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
                               4324               0 :                     newwalconsistency[rmid] = true;
                               4325                 :         }
                               4326                 :         else
                               4327                 :         {
                               4328                 :             /* Check if the token matches any known resource manager. */
  208 tgl                      4329 GNC           2 :             bool        found = false;
                               4330                 : 
                               4331              36 :             for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
                               4332                 :             {
                               4333              54 :                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
                               4334              18 :                     pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
                               4335                 :                 {
                               4336               2 :                     newwalconsistency[rmid] = true;
                               4337               2 :                     found = true;
                               4338               2 :                     break;
                               4339                 :                 }
                               4340                 :             }
                               4341               2 :             if (!found)
                               4342                 :             {
                               4343                 :                 /*
                               4344                 :                  * During startup, it might be a not-yet-loaded custom
                               4345                 :                  * resource manager.  Defer checking until
                               4346                 :                  * InitializeWalConsistencyChecking().
                               4347                 :                  */
  208 tgl                      4348 UNC           0 :                 if (!process_shared_preload_libraries_done)
                               4349                 :                 {
                               4350               0 :                     check_wal_consistency_checking_deferred = true;
                               4351                 :                 }
                               4352                 :                 else
                               4353                 :                 {
                               4354               0 :                     GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
                               4355               0 :                     pfree(rawstring);
                               4356               0 :                     list_free(elemlist);
                               4357               0 :                     return false;
                               4358                 :                 }
                               4359                 :             }
                               4360                 :         }
                               4361                 :     }
                               4362                 : 
  208 tgl                      4363 GNC        1859 :     pfree(rawstring);
                               4364            1859 :     list_free(elemlist);
                               4365                 : 
                               4366                 :     /* assign new value */
                               4367            1859 :     *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool));
                               4368            1859 :     memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
                               4369            1859 :     return true;
                               4370                 : }
                               4371                 : 
                               4372                 : /*
                               4373                 :  * GUC assign_hook for wal_consistency_checking
                               4374                 :  */
                               4375                 : void
                               4376            1859 : assign_wal_consistency_checking(const char *newval, void *extra)
                               4377                 : {
                               4378                 :     /*
                               4379                 :      * If some checks were deferred, it's possible that the checks will fail
                               4380                 :      * later during InitializeWalConsistencyChecking(). But in that case, the
                               4381                 :      * postmaster will exit anyway, so it's safe to proceed with the
                               4382                 :      * assignment.
                               4383                 :      *
                               4384                 :      * Any built-in resource managers specified are assigned immediately,
                               4385                 :      * which affects WAL created before shared_preload_libraries are
                               4386                 :      * processed. Any custom resource managers specified won't be assigned
                               4387                 :      * until after shared_preload_libraries are processed, but that's OK
                               4388                 :      * because WAL for a custom resource manager can't be written before the
                               4389                 :      * module is loaded anyway.
                               4390                 :      */
                               4391            1859 :     wal_consistency_checking = extra;
                               4392            1859 : }
                               4393                 : 
                               4394                 : /*
                               4395                 :  * InitializeWalConsistencyChecking: run after loading custom resource managers
                               4396                 :  *
                               4397                 :  * If any unknown resource managers were specified in the
                               4398                 :  * wal_consistency_checking GUC, processing was deferred.  Now that
                               4399                 :  * shared_preload_libraries have been loaded, process wal_consistency_checking
                               4400                 :  * again.
                               4401                 :  */
                               4402                 : void
                               4403             910 : InitializeWalConsistencyChecking(void)
                               4404                 : {
                               4405             910 :     Assert(process_shared_preload_libraries_done);
                               4406                 : 
                               4407             910 :     if (check_wal_consistency_checking_deferred)
                               4408                 :     {
                               4409                 :         struct config_generic *guc;
                               4410                 : 
  208 tgl                      4411 UNC           0 :         guc = find_option("wal_consistency_checking", false, false, ERROR);
                               4412                 : 
                               4413               0 :         check_wal_consistency_checking_deferred = false;
                               4414                 : 
                               4415               0 :         set_config_option_ext("wal_consistency_checking",
                               4416                 :                               wal_consistency_checking_string,
                               4417                 :                               guc->scontext, guc->source, guc->srole,
                               4418                 :                               GUC_ACTION_SET, true, ERROR, false);
                               4419                 : 
                               4420                 :         /* checking should not be deferred again */
                               4421               0 :         Assert(!check_wal_consistency_checking_deferred);
                               4422                 :     }
  208 tgl                      4423 GNC         910 : }
                               4424                 : 
                               4425                 : /*
                               4426                 :  * GUC show_hook for archive_command
                               4427                 :  */
                               4428                 : const char *
                               4429            1088 : show_archive_command(void)
                               4430                 : {
                               4431            1088 :     if (XLogArchivingActive())
  208 tgl                      4432 UNC           0 :         return XLogArchiveCommand;
                               4433                 :     else
  208 tgl                      4434 GNC        1088 :         return "(disabled)";
                               4435                 : }
                               4436                 : 
                               4437                 : /*
                               4438                 :  * GUC show_hook for in_hot_standby
                               4439                 :  */
                               4440                 : const char *
                               4441            9962 : show_in_hot_standby(void)
                               4442                 : {
                               4443                 :     /*
                               4444                 :      * We display the actual state based on shared memory, so that this GUC
                               4445                 :      * reports up-to-date state if examined intra-query.  The underlying
                               4446                 :      * variable (in_hot_standby_guc) changes only when we transmit a new value
                               4447                 :      * to the client.
                               4448                 :      */
                               4449            9962 :     return RecoveryInProgress() ? "on" : "off";
                               4450                 : }
                               4451                 : 
                               4452                 : /*
 2034 andres                   4453 ECB             :  * Read the control file, set respective GUCs.
                               4454                 :  *
 2030                          4455                 :  * This is to be called during startup, including a crash recovery cycle,
                               4456                 :  * unless in bootstrap mode, where no control file yet exists.  As there's no
                               4457                 :  * usable shared memory yet (its sizing can depend on the contents of the
                               4458                 :  * control file!), first store the contents in local memory. XLOGShmemInit()
                               4459                 :  * will then copy it to shared memory later.
                               4460                 :  *
 2030 andres                   4461 EUB             :  * reset just controls whether previous contents are to be expected (in the
                               4462                 :  * reset case, there's a dangling pointer into old shared memory), or not.
 2034                          4463                 :  */
                               4464                 : void
 2030 andres                   4465 GIC         917 : LocalProcessControlFile(bool reset)
                               4466                 : {
                               4467             917 :     Assert(reset || ControlFile == NULL);
 2034 andres                   4468 CBC         917 :     ControlFile = palloc(sizeof(ControlFileData));
 2034 andres                   4469 GIC         917 :     ReadControlFile();
 2034 andres                   4470 CBC         917 : }
                               4471                 : 
                               4472                 : /*
                               4473                 :  * Get the wal_level from the control file. For a standby, this value should be
                               4474                 :  * considered as its active wal_level, because it may be different from what
                               4475                 :  * was originally configured on standby.
                               4476                 :  */
                               4477                 : WalLevel
    1 andres                   4478 GNC          61 : GetActiveWalLevelOnStandby(void)
                               4479                 : {
                               4480              61 :     return ControlFile->wal_level;
                               4481                 : }
                               4482                 : 
 8170 tgl                      4483 ECB             : /*
 8062                          4484                 :  * Initialization of shared memory for XLOG
                               4485                 :  */
 6441                          4486                 : Size
 8174 peter_e                  4487 CBC        4564 : XLOGShmemSize(void)
 8586 vadim4o                  4488 ECB             : {
                               4489                 :     Size        size;
                               4490                 : 
 4385 tgl                      4491                 :     /*
                               4492                 :      * If the value of wal_buffers is -1, use the preferred auto-tune value.
                               4493                 :      * This isn't an amazingly clean place to do this, but we must wait till
                               4494                 :      * NBuffers has received its final value, and must do it before using the
                               4495                 :      * value of XLOGbuffers to do anything important.
                               4496                 :      *
                               4497                 :      * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
  305 tgl                      4498 EUB             :      * However, if the DBA explicitly set wal_buffers = -1 in the config file,
                               4499                 :      * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
                               4500                 :      * the matter with PGC_S_OVERRIDE.
                               4501                 :      */
 4385 tgl                      4502 GIC        4564 :     if (XLOGbuffers == -1)
                               4503                 :     {
 4385 tgl                      4504 EUB             :         char        buf[32];
                               4505                 : 
 4385 tgl                      4506 GBC        1825 :         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
  305                          4507            1825 :         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
                               4508                 :                         PGC_S_DYNAMIC_DEFAULT);
  305 tgl                      4509 GIC        1825 :         if (XLOGbuffers == -1)  /* failed to apply it? */
  305 tgl                      4510 UIC           0 :             SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
                               4511                 :                             PGC_S_OVERRIDE);
                               4512                 :     }
 4460 tgl                      4513 CBC        4564 :     Assert(XLOGbuffers > 0);
 4460 tgl                      4514 ECB             : 
                               4515                 :     /* XLogCtl */
 6441 tgl                      4516 GIC        4564 :     size = sizeof(XLogCtlData);
 3562 heikki.linnakangas       4517 ECB             : 
 3306                          4518                 :     /* WAL insertion locks, plus alignment */
 3112 heikki.linnakangas       4519 CBC        4564 :     size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
                               4520                 :     /* xlblocks array */
 6441 tgl                      4521 GIC        4564 :     size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
                               4522                 :     /* extra alignment padding for XLOG I/O buffers */
    1 tmunro                   4523 GNC        4564 :     size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
                               4524                 :     /* and the buffers themselves */
 6215 tgl                      4525 GIC        4564 :     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
 6441 tgl                      4526 ECB             : 
                               4527                 :     /*
                               4528                 :      * Note: we don't count ControlFileData, it comes out of the "slop factor"
                               4529                 :      * added by CreateSharedMemoryAndSemaphores.  This lets us use this
                               4530                 :      * routine again below to compute the actual allocation size.
                               4531                 :      */
                               4532                 : 
 6441 tgl                      4533 GIC        4564 :     return size;
                               4534                 : }
                               4535                 : 
                               4536                 : void
 8586 vadim4o                  4537            1826 : XLOGShmemInit(void)
                               4538                 : {
                               4539                 :     bool        foundCFile,
                               4540                 :                 foundXLog;
 6441 tgl                      4541 ECB             :     char       *allocptr;
 3562 heikki.linnakangas       4542                 :     int         i;
                               4543                 :     ControlFileData *localControlFile;
                               4544                 : 
                               4545                 : #ifdef WAL_DEBUG
                               4546                 : 
                               4547                 :     /*
                               4548                 :      * Create a memory context for WAL debugging that's exempt from the normal
                               4549                 :      * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
                               4550                 :      * an allocation fails, but wal_debug is not for production use anyway.
                               4551                 :      */
                               4552                 :     if (walDebugCxt == NULL)
 3205                          4553                 :     {
                               4554                 :         walDebugCxt = AllocSetContextCreate(TopMemoryContext,
                               4555                 :                                             "WAL Debug",
                               4556                 :                                             ALLOCSET_DEFAULT_SIZES);
                               4557                 :         MemoryContextAllowInCriticalSection(walDebugCxt, true);
                               4558                 :     }
                               4559                 : #endif
                               4560                 : 
 2030 andres                   4561 EUB             : 
 2030 andres                   4562 GIC        1826 :     XLogCtl = (XLogCtlData *)
 2030 andres                   4563 GBC        1826 :         ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
                               4564                 : 
 2034                          4565            1826 :     localControlFile = ControlFile;
 8170 tgl                      4566 GIC        1826 :     ControlFile = (ControlFileData *)
 7050 bruce                    4567            1826 :         ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
                               4568                 : 
 6439 tgl                      4569            1826 :     if (foundCFile || foundXLog)
                               4570                 :     {
 7050 bruce                    4571 EUB             :         /* both should be present or neither */
 6439 tgl                      4572 UIC           0 :         Assert(foundCFile && foundXLog);
 3181 rhaas                    4573 ECB             : 
                               4574                 :         /* Initialize local copy of WALInsertLocks */
 3181 rhaas                    4575 UIC           0 :         WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
                               4576                 : 
 2030 andres                   4577               0 :         if (localControlFile)
                               4578               0 :             pfree(localControlFile);
 7050 bruce                    4579 LBC           0 :         return;
                               4580                 :     }
 8062 tgl                      4581 CBC        1826 :     memset(XLogCtl, 0, sizeof(XLogCtlData));
 8053 bruce                    4582 EUB             : 
                               4583                 :     /*
 2030 andres                   4584 ECB             :      * Already have read control file locally, unless in bootstrap mode. Move
                               4585                 :      * contents into shared memory.
                               4586                 :      */
 2030 andres                   4587 GIC        1826 :     if (localControlFile)
                               4588                 :     {
                               4589             911 :         memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
                               4590             911 :         pfree(localControlFile);
 2030 andres                   4591 ECB             :     }
                               4592                 : 
                               4593                 :     /*
                               4594                 :      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
                               4595                 :      * multiple of the alignment for same, so no extra alignment padding is
                               4596                 :      * needed here.
                               4597                 :      */
 3562 heikki.linnakangas       4598 GIC        1826 :     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
 6441 tgl                      4599 CBC        1826 :     XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
 8062 tgl                      4600 GIC        1826 :     memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
 6441                          4601            1826 :     allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
                               4602                 : 
                               4603                 : 
                               4604                 :     /* WAL insertion locks. Ensure they're aligned to the full padded size */
 3306 heikki.linnakangas       4605            1826 :     allocptr += sizeof(WALInsertLockPadded) -
 2118 tgl                      4606            1826 :         ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
 3306 heikki.linnakangas       4607            1826 :     WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
                               4608                 :         (WALInsertLockPadded *) allocptr;
 3112                          4609            1826 :     allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
                               4610                 : 
                               4611           16434 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
                               4612                 :     {
 2672 rhaas                    4613           14608 :         LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
 3306 heikki.linnakangas       4614           14608 :         WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
 2299 andres                   4615 CBC       14608 :         WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
                               4616                 :     }
 3562 heikki.linnakangas       4617 ECB             : 
 8062 tgl                      4618                 :     /*
 3562 heikki.linnakangas       4619                 :      * Align the start of the page buffers to a full xlog block size boundary.
 3260 bruce                    4620                 :      * This simplifies some calculations in XLOG insertion. It is also
                               4621                 :      * required for O_DIRECT.
                               4622                 :      */
 3562 heikki.linnakangas       4623 GIC        1826 :     allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
 6441 tgl                      4624            1826 :     XLogCtl->pages = allocptr;
 6215                          4625            1826 :     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
                               4626                 : 
                               4627                 :     /*
 6385 bruce                    4628 ECB             :      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
                               4629                 :      * in additional info.)
 8062 tgl                      4630                 :      */
 8062 tgl                      4631 GIC        1826 :     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
 1080 michael                  4632            1826 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
  650 noah                     4633            1826 :     XLogCtl->InstallXLogFileSegmentActive = false;
 3988 tgl                      4634            1826 :     XLogCtl->WalWriterSleeping = false;
                               4635                 : 
 3562 heikki.linnakangas       4636            1826 :     SpinLockInit(&XLogCtl->Insert.insertpos_lck);
 7862 tgl                      4637 CBC        1826 :     SpinLockInit(&XLogCtl->info_lck);
 3709 heikki.linnakangas       4638 GIC        1826 :     SpinLockInit(&XLogCtl->ulsn_lck);
                               4639                 : }
                               4640                 : 
                               4641                 : /*
                               4642                 :  * This function must be called exactly ONCE, at system install time.  It
                               4643                 :  * writes the initial checkpoint record into the first XLOG segment and
                                                       * creates pg_control, then bootstraps CLOG, commit timestamps, SUBTRANS
                                                       * and MultiXact, and finally re-reads the control file.
                                                       *
                                                       * Failure to write, fsync or close the bootstrap segment is reported at
                                                       * PANIC level.
                               4644                 :  */
                               4645                 : void
  8062 tgl                      4646             305 : BootStrapXLOG(void)
                               4647                 : {
                               4648                 :     CheckPoint  checkPoint;
                               4649                 :     char       *buffer;     /* unaligned allocation; page points into it */
                               4650                 :     XLogPageHeader page;
                               4651                 :     XLogLongPageHeader longpage;
  8397 bruce                    4652 ECB             :     XLogRecord *record;
                               4653                 :     char       *recptr;
                               4654                 :     uint64      sysidentifier;
                               4655                 :     struct timeval tv;
  2917 heikki.linnakangas       4656                 :     pg_crc32c   crc;
  8586 vadim4o                  4657                 : 
                               4658                 :     /* allow ordinary WAL segment creation, like StartupXLOG() would */
   235 michael                  4659 GNC         305 :     SetInstallXLogFileSegmentActive();
                               4660                 : 
  6997 tgl                      4661 ECB             :     /*
                               4662                 :      * Select a hopefully-unique system identifier code for this installation.
                               4663                 :      * We use the result of gettimeofday(), including the fractional seconds
  6385 bruce                    4664                 :      * field, as being about as unique as we can easily get.  (Do not try to
                               4665                 :      * use random(), since it hasn't been seeded and there's no portable way
                               4666                 :      * to seed it other than the system clock value...)  The upper half of the
  3270 tgl                      4667                 :      * uint64 value is just the tv_sec part, while the lower half contains the
                               4668                 :      * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
                               4669                 :      * PID for a little extra uniqueness.  A person knowing this encoding can
                               4670                 :      * determine the initialization time of the installation, which could
                               4671                 :      * perhaps be useful sometimes.
                               4672                 :      */
  6997 tgl                      4673 CBC         305 :     gettimeofday(&tv, NULL);
  6997 tgl                      4674 GIC         305 :     sysidentifier = ((uint64) tv.tv_sec) << 32;
  3270                          4675             305 :     sysidentifier |= ((uint64) tv.tv_usec) << 12;
                               4676             305 :     sysidentifier |= getpid() & 0xFFF;
                               4677                 : 
                               4678                 :     /* page buffer must be aligned suitably for O_DIRECT, hence the 2x alloc */
  3562 heikki.linnakangas       4679             305 :     buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
                               4680             305 :     page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
  6215 tgl                      4681 CBC         305 :     memset(page, 0, XLOG_BLCKSZ);
                               4682                 : 
                               4683                 :     /*
                               4684                 :      * Set up information for the initial checkpoint record
  4541 heikki.linnakangas       4685 ECB             :      *
                               4686                 :      * The initial checkpoint record is written to the beginning of the WAL
                               4687                 :      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
                               4688                 :      * used, so that we can use 0/0 to mean "before any valid WAL segment".
                               4689                 :      */
  2028 andres                   4690 GIC         305 :     checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
   520 rhaas                    4691             305 :     checkPoint.ThisTimeLineID = BootstrapTimeLineID;
                               4692             305 :     checkPoint.PrevTimeLineID = BootstrapTimeLineID;
  4092 simon                    4693             305 :     checkPoint.fullPageWrites = fullPageWrites;
                               4694                 :     checkPoint.nextXid =
  1473 tmunro                   4695             305 :         FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
   633 tgl                      4696             305 :     checkPoint.nextOid = FirstGenbkiObjectId;
  6555                          4697             305 :     checkPoint.nextMulti = FirstMultiXactId;
  6514                          4698             305 :     checkPoint.nextMultiOffset = 0;
  4969                          4699             305 :     checkPoint.oldestXid = FirstNormalTransactionId;
   353                          4700             305 :     checkPoint.oldestXidDB = Template1DbOid;
  3728 alvherre                 4701             305 :     checkPoint.oldestMulti = FirstMultiXactId;
   353 tgl                      4702             305 :     checkPoint.oldestMultiDB = Template1DbOid;
  2659 mail                     4703             305 :     checkPoint.oldestCommitTsXid = InvalidTransactionId;
                               4704             305 :     checkPoint.newestCommitTsXid = InvalidTransactionId;
  5530 tgl                      4705             305 :     checkPoint.time = (pg_time_t) time(NULL);
  4859 simon                    4706             305 :     checkPoint.oldestActiveXid = InvalidTransactionId;
                               4707                 : 
   971 andres                   4708             305 :     ShmemVariableCache->nextXid = checkPoint.nextXid;
  8174 vadim4o                  4709             305 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
  8174 vadim4o                  4710 CBC         305 :     ShmemVariableCache->oidCount = 0;
  6514 tgl                      4711             305 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
  2208 rhaas                    4712 GIC         305 :     AdvanceOldestClogXid(checkPoint.oldestXid);
  4799 tgl                      4713 CBC         305 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
  2217                          4714             305 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
  3049 alvherre                 4715             305 :     SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
                               4716                 : 
  6997 tgl                      4717 ECB             :     /* Set up the XLOG page header */
  8586 vadim4o                  4718 GIC         305 :     page->xlp_magic = XLOG_PAGE_MAGIC;
  6836 tgl                      4719             305 :     page->xlp_info = XLP_LONG_HEADER;
   520 rhaas                    4720 GBC         305 :     page->xlp_tli = BootstrapTimeLineID;
  2028 andres                   4721 GIC         305 :     page->xlp_pageaddr = wal_segment_size;
  6836 tgl                      4722             305 :     longpage = (XLogLongPageHeader) page;
  6836 tgl                      4723 GBC         305 :     longpage->xlp_sysid = sysidentifier;
  2028 andres                   4724 GIC         305 :     longpage->xlp_seg_size = wal_segment_size;
  6213 tgl                      4725 GBC         305 :     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
  6997 tgl                      4726 EUB             : 
                               4727                 :     /* Insert the initial checkpoint record */
  3062 heikki.linnakangas       4728 GIC         305 :     recptr = ((char *) page + SizeOfXLogLongPHD);
  3062 heikki.linnakangas       4729 CBC         305 :     record = (XLogRecord *) recptr;
  3941 heikki.linnakangas       4730 GIC         305 :     record->xl_prev = 0;
  8586 vadim4o                  4731             305 :     record->xl_xid = InvalidTransactionId;
  3062 heikki.linnakangas       4732             305 :     record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
  8062 tgl                      4733             305 :     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
  8586 vadim4o                  4734             305 :     record->xl_rmid = RM_XLOG_ID;
  3062 heikki.linnakangas       4735 CBC         305 :     recptr += SizeOfXLogRecord;
                               4736                 :     /* fill the XLogRecordDataHeaderShort struct */
  2203 tgl                      4737             305 :     *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
  3062 heikki.linnakangas       4738             305 :     *(recptr++) = sizeof(checkPoint);   /* payload length, stored as one byte */
  3062 heikki.linnakangas       4739 GIC         305 :     memcpy(recptr, &checkPoint, sizeof(checkPoint));
                               4740             305 :     recptr += sizeof(checkPoint);
                               4741             305 :     Assert(recptr - (char *) record == record->xl_tot_len);
                               4742                 : 
  3078                          4743             305 :     INIT_CRC32C(crc);
  3062                          4744             305 :     COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);  /* data after the fixed header first */
  3078                          4745             305 :     COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));    /* then the header fields preceding xl_crc */
  3078 heikki.linnakangas       4746 CBC         305 :     FIN_CRC32C(crc);
  8137 vadim4o                  4747             305 :     record->xl_crc = crc;
  8137 vadim4o                  4748 ECB             : 
  6997 tgl                      4749                 :     /* Create first XLOG segment file (segment 1; segment 0 is never used) */
   520 rhaas                    4750 GIC         305 :     openLogTLI = BootstrapTimeLineID;
                               4751             305 :     openLogFile = XLogFileInit(1, BootstrapTimeLineID);
                               4752                 : 
  1140 tgl                      4753 ECB             :     /*
                               4754                 :      * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
                               4755                 :      * close the file again in a moment.
                               4756                 :      */
                               4757                 : 
                               4758                 :     /* Write the first page with the initial record */
  7977 tgl                      4759 CBC         305 :     errno = 0;
  2213 rhaas                    4760 GIC         305 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
  6215 tgl                      4761 CBC         305 :     if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
  7977 tgl                      4762 ECB             :     {
                               4763                 :         /* if write didn't set errno, assume problem is no disk space */
  7977 tgl                      4764 UIC           0 :         if (errno == 0)
                               4765               0 :             errno = ENOSPC;
  7202                          4766               0 :         ereport(PANIC,
                               4767                 :                 (errcode_for_file_access(),
                               4768                 :                  errmsg("could not write bootstrap write-ahead log file: %m")));
                               4769                 :     }
  2213 rhaas                    4770 GIC         305 :     pgstat_report_wait_end();
  8586 vadim4o                  4771 ECB             : 
  2213 rhaas                    4772 CBC         305 :     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
  8062 tgl                      4773             305 :     if (pg_fsync(openLogFile) != 0)
  7202 tgl                      4774 UIC           0 :         ereport(PANIC,
                               4775                 :                 (errcode_for_file_access(),
                               4776                 :                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
  2213 rhaas                    4777 GIC         305 :     pgstat_report_wait_end();
                               4778                 : 
  1373 peter                    4779 CBC         305 :     if (close(openLogFile) != 0)
  7013 tgl                      4780 LBC           0 :         ereport(PANIC,
  7013 tgl                      4781 ECB             :                 (errcode_for_file_access(),
  2118                          4782                 :                  errmsg("could not close bootstrap write-ahead log file: %m")));
                               4783                 : 
  8062 tgl                      4784 CBC         305 :     openLogFile = -1;
  8586 vadim4o                  4785 ECB             : 
  6997 tgl                      4786                 :     /* Now create pg_control */
  1147 peter                    4787 GIC         305 :     InitControlFile(sysidentifier);
  8062 tgl                      4788             305 :     ControlFile->time = checkPoint.time;
  8586 vadim4o                  4789             305 :     ControlFile->checkPoint = checkPoint.redo;
  8062 tgl                      4790             305 :     ControlFile->checkPointCopy = checkPoint;
                               4791                 : 
                               4792                 :     /* some additional ControlFile fields are set in WriteControlFile() */
                               4793             305 :     WriteControlFile();
  7897 tgl                      4794 ECB             : 
                               4795                 :     /* Bootstrap the commit log, too */
  7897 tgl                      4796 GIC         305 :     BootStrapCLOG();
  3049 alvherre                 4797             305 :     BootStrapCommitTs();
  6856 tgl                      4798             305 :     BootStrapSUBTRANS();
  6555                          4799             305 :     BootStrapMultiXact();
                               4800                 : 
  6441                          4801             305 :     pfree(buffer);          /* releases the allocation that page was aligned within */
                               4802                 : 
                               4803                 :     /*
                               4804                 :      * Force control file to be read - in contrast to normal processing we'd
                               4805                 :      * otherwise never run the checks and GUC related initializations therein.
                               4806                 :      */
  2034 andres                   4807 CBC         305 :     ReadControlFile();
  8586 vadim4o                  4808 GIC         305 : }
                               4809                 : 
                                                      /*
                                                       * Format tnow as a human-readable timestamp (year-month-day, time of day
                                                       * and zone abbreviation) using log_timezone.
                                                       *
                                                       * The result lives in a function-local static buffer, so each call
                                                       * overwrites the string returned by the previous one, and the caller
                                                       * must not free it.
                                                       */
                               4810                 : static char *
  5727 tgl                      4811             566 : str_time(pg_time_t tnow)
                               4812                 : {
                               4813                 :     static char buf[128];
                               4814                 : 
                               4815             566 :     pg_strftime(buf, sizeof(buf),
                               4816                 :                 "%Y-%m-%d %H:%M:%S %Z",
                               4817             566 :                 pg_localtime(&tnow, log_timezone));
                               4818                 : 
  8174 peter_e                  4819             566 :     return buf;
                               4820                 : }
 8586 vadim4o                  4821 ECB             : 
  6838 tgl                      4822                 : /*
   417 heikki.linnakangas       4823                 :  * Initialize the first WAL segment on a new timeline.
                                                       *
                                                       * endOfLog is the WAL position at which the old timeline (endTLI) ends,
                                                       * and newTLI is the timeline being switched to; the two timelines must
                                                       * differ.  If endOfLog falls inside a segment, that segment is copied to
                                                       * the new timeline up to the switch point; if it falls exactly on a
                                                       * segment boundary, the next segment is created on the new timeline
                                                       * instead.  Finally, XLogArchiveCleanup() is called for the new segment.
  6838 tgl                      4824                 :  */
                               4825                 : static void
   417 heikki.linnakangas       4826 GIC          39 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
  6838 tgl                      4827 ECB             : {
  3090 fujii                    4828                 :     char        xlogfname[MAXFNAMELEN];
  3034 heikki.linnakangas       4829                 :     XLogSegNo   endLogSegNo;
                               4830                 :     XLogSegNo   startLogSegNo;
                               4831                 : 
                               4832                 :     /* we always switch to a new timeline after archive recovery */
   520 rhaas                    4833 GIC          39 :     Assert(endTLI != newTLI);
                               4834                 : 
                               4835                 :     /*
                               4836                 :      * Update min recovery point one last time.
                               4837                 :      */
  5036 heikki.linnakangas       4838 CBC          39 :     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
  5036 heikki.linnakangas       4839 ECB             : 
  6838 tgl                      4840                 :     /*
  3034 heikki.linnakangas       4841                 :      * Calculate the last segment on the old timeline, and the first segment
                               4842                 :      * on the new timeline. If the switch happens in the middle of a segment,
                               4843                 :      * they are the same, but if the switch happens exactly at a segment
                               4844                 :      * boundary, startLogSegNo will be endLogSegNo + 1.
                               4845                 :      */
  2028 andres                   4846 CBC          39 :     XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
                               4847              39 :     XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
  3034 heikki.linnakangas       4848 ECB             : 
                               4849                 :     /*
                               4850                 :      * Initialize the starting WAL segment for the new timeline. If the switch
                               4851                 :      * happens in the middle of a segment, copy data from the last WAL segment
                               4852                 :      * of the old timeline up to the switch point, to the starting WAL segment
                               4853                 :      * on the new timeline.
  6838 tgl                      4854                 :      */
  3034 heikki.linnakangas       4855 GIC          39 :     if (endLogSegNo == startLogSegNo)
  6838 tgl                      4856 ECB             :     {
  2893 heikki.linnakangas       4857                 :         /*
                               4858                 :          * Make a copy of the file on the new timeline.
                               4859                 :          *
                               4860                 :          * Writing WAL isn't allowed yet, so there are no locking
                               4861                 :          * considerations. But we should be just as careful as XLogFileInit to
                               4862                 :          * avoid emplacing a bogus file.
                               4863                 :          */
   520 rhaas                    4864 GIC          30 :         XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
  2028 andres                   4865              30 :                      XLogSegmentOffset(endOfLog, wal_segment_size));
  6838 tgl                      4866 ECB             :     }
  3034 heikki.linnakangas       4867                 :     else
                               4868                 :     {
  2893                          4869                 :         /*
                               4870                 :          * The switch happened at a segment boundary, so just create the next
                               4871                 :          * segment on the new timeline.
                               4872                 :          */
  3031                          4873                 :         int         fd;
                               4874                 : 
   520 rhaas                    4875 GIC           9 :         fd = XLogFileInit(startLogSegNo, newTLI);
  3031 heikki.linnakangas       4876 ECB             : 
  1373 peter                    4877 CBC           9 :         if (close(fd) != 0)
  1223 michael                  4878 ECB             :         {
  1223 michael                  4879 LBC           0 :             int         save_errno = errno; /* keep for %m in case XLogFileName() changes errno */
  1223 michael                  4880 ECB             : 
   520 rhaas                    4881 LBC           0 :             XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
  1223 michael                  4882               0 :             errno = save_errno;
  3031 heikki.linnakangas       4883 UIC           0 :             ereport(ERROR,
  3031 heikki.linnakangas       4884 ECB             :                     (errcode_for_file_access(),
  1223 michael                  4885                 :                      errmsg("could not close file \"%s\": %m", xlogfname)));
                               4886                 :         }
  3034 heikki.linnakangas       4887                 :     }
  6838 tgl                      4888                 : 
                               4889                 :     /*
  6385 bruce                    4890                 :      * Make sure that no stale .ready or .done flags are posted for the new
                               4891                 :      * segment.
  6838 tgl                      4892                 :      */
   520 rhaas                    4893 CBC          39 :     XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
  3090 fujii                    4894              39 :     XLogArchiveCleanup(xlogfname);
  6838 tgl                      4895 GIC          39 : }
                               4896                 : 
   543 rhaas                    4897 ECB             : /*
                               4898                 :  * Perform cleanup actions at the conclusion of archive recovery.
                                                       *
                                                       * In order, this:
                                                       *  - executes recovery_end_command, if one is set and non-empty;
                                                       *  - calls RemoveNonParentXlogFiles() for EndOfLog and newTLI;
                                                       *  - if the timeline switch happened mid-segment and WAL archiving is
                                                       *    active, renames the last old-timeline segment (named using
                                                       *    EndOfLogTLI) to a .partial file and marks it for archiving, unless
                                                       *    that segment already has a .ready or .done status.
                                                       *
                                                       * A failure of the rename is reported at ERROR level.
                               4899                 :  */
                               4900                 : static void
   520 rhaas                    4901 GIC          39 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
                               4902                 :                             TimeLineID newTLI)
                               4903                 : {
                               4904                 :     /*
                               4905                 :      * Execute the recovery_end_command, if any.
   543 rhaas                    4906 ECB             :      */
   543 rhaas                    4907 CBC          39 :     if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
    62 michael                  4908               2 :         ExecuteRecoveryCommand(recoveryEndCommand,
                               4909                 :                                "recovery_end_command",
                               4910                 :                                true,
    62 michael                  4911 EUB             :                                WAIT_EVENT_RECOVERY_END_COMMAND);
   543 rhaas                    4912                 : 
                               4913                 :     /*
                               4914                 :      * We switched to a new timeline. Clean up segments on the old timeline.
                               4915                 :      *
                               4916                 :      * If there are any higher-numbered segments on the old timeline, remove
   417 heikki.linnakangas       4917 ECB             :      * them. They might contain valid WAL, but they might also be
                               4918                 :      * pre-allocated files containing garbage. In any case, they are not part
                               4919                 :      * of the new timeline's history so we don't need them.
   543 rhaas                    4920                 :      */
   520 rhaas                    4921 GBC          39 :     RemoveNonParentXlogFiles(EndOfLog, newTLI);
                               4922                 : 
                               4923                 :     /*
   543 rhaas                    4924 ECB             :      * If the switch happened in the middle of a segment, what to do with the
                               4925                 :      * last, partial segment on the old timeline? If we don't archive it, and
   417 heikki.linnakangas       4926                 :      * the server that created the WAL never archives it either (e.g. because
   417 heikki.linnakangas       4927 EUB             :      * it was hit by a meteor), it will never make it to the archive. That's
                               4928                 :      * OK from our point of view, because the new segment that we created with
                               4929                 :      * the new TLI contains all the WAL from the old timeline up to the switch
                               4930                 :      * point. But if you later try to do PITR to the "missing" WAL on the old
   417 heikki.linnakangas       4931 ECB             :      * timeline, recovery won't find it in the archive. It's physically
                               4932                 :      * present in the new file with new TLI, but recovery won't look there
                               4933                 :      * when it's recovering to the older timeline. On the other hand, if we
                               4934                 :      * archive the partial segment, and the original server on that timeline
                               4935                 :      * is still running and archives the completed version of the same segment
                               4936                 :      * later, it will fail. (We used to do that in 9.4 and below, and it
                               4937                 :      * caused such problems).
                               4938                 :      *
                               4939                 :      * As a compromise, we rename the last segment with the .partial suffix,
                               4940                 :      * and archive it. Archive recovery will never try to read .partial
                               4941                 :      * segments, so they will normally go unused. But in the odd PITR case,
                               4942                 :      * the administrator can copy them manually to the pg_wal directory
                               4943                 :      * (removing the suffix). They can be useful in debugging, too.
   543 rhaas                    4944                 :      *
                               4945                 :      * If a .done or .ready file already exists for the old timeline, however,
   417 heikki.linnakangas       4946                 :      * we had already determined that the segment is complete, so we can let
                               4947                 :      * it be archived normally. (In particular, if it was restored from the
                               4948                 :      * archive to begin with, it's expected to have a .done file).
                               4949                 :      */
   543 rhaas                    4950 GIC          39 :     if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
                               4951              30 :         XLogArchivingActive())
                               4952                 :     {
                               4953                 :         char        origfname[MAXFNAMELEN];
   543 rhaas                    4954 ECB             :         XLogSegNo   endLogSegNo;
                               4955                 : 
   543 rhaas                    4956 GIC           6 :         XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
                               4957               6 :         XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
   543 rhaas                    4958 ECB             : 
   543 rhaas                    4959 GIC           6 :         if (!XLogArchiveIsReadyOrDone(origfname))
                               4960                 :         {
                               4961                 :             char        origpath[MAXPGPATH];
   543 rhaas                    4962 ECB             :             char        partialfname[MAXFNAMELEN];
                               4963                 :             char        partialpath[MAXPGPATH];
                               4964                 : 
   543 rhaas                    4965 GIC           4 :             XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
   543 rhaas                    4966 CBC           4 :             snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
   543 rhaas                    4967 GIC           4 :             snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
                               4968                 : 
                               4969                 :             /*
                               4970                 :              * Make sure there's no .done or .ready file for the .partial
                               4971                 :              * file.
                               4972                 :              */
   543 rhaas                    4973 CBC           4 :             XLogArchiveCleanup(partialfname);
                               4974                 : 
   543 rhaas                    4975 GIC           4 :             durable_rename(origpath, partialpath, ERROR);
                               4976               4 :             XLogArchiveNotify(partialfname);
                               4977                 :         }
                               4978                 :     }
                               4979              39 : }
  543 rhaas                    4980 ECB             : 
                               4981                 : /*
                               4982                 :  * Check to see if required parameters are set high enough on this server
                               4983                 :  * for various aspects of recovery operation.
                               4984                 :  *
  417 heikki.linnakangas       4985                 :  * Note that all the parameters which this function tests need to be
                               4986                 :  * listed in Administrator's Overview section in high-availability.sgml.
                               4987                 :  * If you change them, don't forget to update the list.
                               4988                 :  */
                               4989                 : static void
  417 heikki.linnakangas       4990 GIC         170 : CheckRequiredParameterValues(void)
                               4991                 : {
                               4992                 :     /*
  417 heikki.linnakangas       4993 ECB             :      * For archive recovery, the WAL must be generated with at least 'replica'
                               4994                 :      * wal_level.
                               4995                 :      */
  417 heikki.linnakangas       4996 GIC         170 :     if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
                               4997                 :     {
                               4998               2 :         ereport(FATAL,
                               4999                 :                 (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
                               5000                 :                  errdetail("This happens if you temporarily set wal_level=minimal on the server."),
                               5001                 :                  errhint("Use a backup taken after setting wal_level to higher than minimal.")));
 3377 heikki.linnakangas       5002 ECB             :     }
                               5003                 : 
                               5004                 :     /*
                               5005                 :      * For Hot Standby, the WAL must be generated with 'replica' mode, and we
                               5006                 :      * must have at least as many backend slots as the primary.
                               5007                 :      */
 3322 heikki.linnakangas       5008 GIC         168 :     if (ArchiveRecoveryRequested && EnableHotStandby)
                               5009                 :     {
                               5010                 :         /* We ignore autovacuum_max_workers when we make this test. */
 4729 heikki.linnakangas       5011 CBC          75 :         RecoveryRequiresIntParameter("max_connections",
 4728 tgl                      5012 ECB             :                                      MaxConnections,
 4728 tgl                      5013 GIC          75 :                                      ControlFile->MaxConnections);
 3566 rhaas                    5014              75 :         RecoveryRequiresIntParameter("max_worker_processes",
                               5015                 :                                      max_worker_processes,
                               5016              75 :                                      ControlFile->max_worker_processes);
 1517 michael                  5017              75 :         RecoveryRequiresIntParameter("max_wal_senders",
                               5018                 :                                      max_wal_senders,
                               5019              75 :                                      ControlFile->max_wal_senders);
 3868 tgl                      5020              75 :         RecoveryRequiresIntParameter("max_prepared_transactions",
                               5021                 :                                      max_prepared_xacts,
 4728 tgl                      5022 CBC          75 :                                      ControlFile->max_prepared_xacts);
 3868 tgl                      5023 GIC          75 :         RecoveryRequiresIntParameter("max_locks_per_transaction",
 4728 tgl                      5024 ECB             :                                      max_locks_per_xact,
 4728 tgl                      5025 GIC          75 :                                      ControlFile->max_locks_per_xact);
 4729 heikki.linnakangas       5026 EUB             :     }
 4859 simon                    5027 GIC         168 : }
 4859 simon                    5028 EUB             : 
 8586 vadim4o                  5029                 : /*
 8062 tgl                      5030                 :  * This must be called ONCE during postmaster or standalone-backend startup
                               5031                 :  */
                               5032                 : void
 8062 tgl                      5033 GIC        1176 : StartupXLOG(void)
                               5034                 : {
                               5035                 :     XLogCtlInsert *Insert;
                               5036                 :     CheckPoint  checkPoint;
                               5037                 :     bool        wasShutdown;
                               5038                 :     bool        didCrash;
                               5039                 :     bool        haveTblspcMap;
  417 heikki.linnakangas       5040 ECB             :     bool        haveBackupLabel;
                               5041                 :     XLogRecPtr  EndOfLog;
 2880                          5042                 :     TimeLineID  EndOfLogTLI;
                               5043                 :     TimeLineID  newTLI;
                               5044                 :     bool        performedWalRecovery;
                               5045                 :     EndOfWalRecoveryInfo *endOfRecoveryInfo;
                               5046                 :     XLogRecPtr  abortedRecPtr;
                               5047                 :     XLogRecPtr  missingContrecPtr;
 6505 tgl                      5048                 :     TransactionId oldestActiveXID;
  984 fujii                    5049 GIC        1176 :     bool        promoted = false;
                               5050                 : 
                               5051                 :     /*
                               5052                 :      * We should have an aux process resource owner to use, and we should not
                               5053                 :      * be in a transaction that's installed some other resowner.
 1726 tgl                      5054 ECB             :      */
 1726 tgl                      5055 CBC        1176 :     Assert(AuxProcessResourceOwner != NULL);
 1726 tgl                      5056 GIC        1176 :     Assert(CurrentResourceOwner == NULL ||
                               5057                 :            CurrentResourceOwner == AuxProcessResourceOwner);
                               5058            1176 :     CurrentResourceOwner = AuxProcessResourceOwner;
                               5059                 : 
                               5060                 :     /*
                               5061                 :      * Check that contents look valid.
                               5062                 :      */
 1248 peter                    5063            1176 :     if (!XRecOffIsValid(ControlFile->checkPoint))
 7202 tgl                      5064 UIC           0 :         ereport(FATAL,
                               5065                 :                 (errmsg("control file contains invalid checkpoint location")));
                               5066                 : 
 1248 peter                    5067 GIC        1176 :     switch (ControlFile->state)
 3587 tgl                      5068 ECB             :     {
 1248 peter                    5069 GIC        1031 :         case DB_SHUTDOWNED:
                               5070                 : 
                               5071                 :             /*
                               5072                 :              * This is the expected case, so don't be chatty in standalone
                               5073                 :              * mode
                               5074                 :              */
                               5075            1031 :             ereport(IsPostmasterEnvironment ? LOG : NOTICE,
                               5076                 :                     (errmsg("database system was shut down at %s",
                               5077                 :                             str_time(ControlFile->time))));
                               5078            1031 :             break;
                               5079                 : 
                               5080              14 :         case DB_SHUTDOWNED_IN_RECOVERY:
                               5081              14 :             ereport(LOG,
                               5082                 :                     (errmsg("database system was shut down in recovery at %s",
                               5083                 :                             str_time(ControlFile->time))));
                               5084              14 :             break;
                               5085                 : 
 1248 peter                    5086 UIC           0 :         case DB_SHUTDOWNING:
                               5087               0 :             ereport(LOG,
                               5088                 :                     (errmsg("database system shutdown was interrupted; last known up at %s",
                               5089                 :                             str_time(ControlFile->time))));
                               5090               0 :             break;
                               5091                 : 
                               5092               0 :         case DB_IN_CRASH_RECOVERY:
                               5093               0 :             ereport(LOG,
                               5094                 :                     (errmsg("database system was interrupted while in recovery at %s",
                               5095                 :                             str_time(ControlFile->time)),
                               5096                 :                      errhint("This probably means that some data is corrupted and"
 1248 peter                    5097 ECB             :                              " you will have to use the last backup for recovery.")));
 1248 peter                    5098 LBC           0 :             break;
                               5099                 : 
 1248 peter                    5100 GIC           4 :         case DB_IN_ARCHIVE_RECOVERY:
                               5101               4 :             ereport(LOG,
                               5102                 :                     (errmsg("database system was interrupted while in recovery at log time %s",
 1248 peter                    5103 ECB             :                             str_time(ControlFile->checkPointCopy.time)),
                               5104                 :                      errhint("If this has occurred more than once some data might be corrupted"
                               5105                 :                              " and you might need to choose an earlier recovery target.")));
 1248 peter                    5106 CBC           4 :             break;
                               5107                 : 
 1248 peter                    5108 GIC         127 :         case DB_IN_PRODUCTION:
                               5109             127 :             ereport(LOG,
                               5110                 :                     (errmsg("database system was interrupted; last known up at %s",
                               5111                 :                             str_time(ControlFile->time))));
 1248 peter                    5112 CBC         127 :             break;
 1248 peter                    5113 ECB             : 
 1248 peter                    5114 LBC           0 :         default:
 1248 peter                    5115 UIC           0 :             ereport(FATAL,
                               5116                 :                     (errmsg("control file contains invalid database cluster state")));
                               5117                 :     }
                               5118                 : 
                               5119                 :     /* This is just to allow attaching to startup process with a debugger */
 7352 tgl                      5120 ECB             : #ifdef XLOG_REPLAY_DELAY
                               5121                 :     if (ControlFile->state != DB_SHUTDOWNED)
 6929 bruce                    5122                 :         pg_usleep(60000000L);
 7352 tgl                      5123                 : #endif
                               5124                 : 
                               5125                 :     /*
 2362 rhaas                    5126                 :      * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
                               5127                 :      * someone has performed a copy for PITR, these directories may have been
                               5128                 :      * excluded and need to be re-created.
                               5129                 :      */
 5264 tgl                      5130 GIC        1176 :     ValidateXLOGDirectoryStructure();
                               5131                 : 
                               5132                 :     /* Set up timeout handler needed to report startup progress. */
  531 rhaas                    5133            1176 :     if (!IsBootstrapProcessingMode())
                               5134             871 :         RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
                               5135                 :                         startup_progress_timeout_handler);
                               5136                 : 
 1731 michael                  5137 ECB             :     /*----------
                               5138                 :      * If we previously crashed, perform a couple of actions:
                               5139                 :      *
                               5140                 :      * - The pg_wal directory may still include some temporary WAL segments
                               5141                 :      *   used when creating a new segment, so perform some clean up to not
                               5142                 :      *   bloat this path.  This is done first as there is no point to sync
 1147 peter                    5143                 :      *   this temporary data.
                               5144                 :      *
                               5145                 :      * - There might be data which we had written, intending to fsync it, but
                               5146                 :      *   which we had not actually fsync'd yet.  Therefore, a power failure in
                               5147                 :      *   the near future might cause earlier unflushed writes to be lost, even
                               5148                 :      *   though more recent data written to disk from here on would be
                               5149                 :      *   persisted.  To avoid that, fsync the entire data directory.
                               5150                 :      */
  417 heikki.linnakangas       5151 GIC        1176 :     if (ControlFile->state != DB_SHUTDOWNED &&
                               5152             145 :         ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
                               5153                 :     {
                               5154             131 :         RemoveTempXlogFiles();
  417 heikki.linnakangas       5155 CBC         131 :         SyncDataDirectory();
  368 andres                   5156 GIC         131 :         didCrash = true;
                               5157                 :     }
  368 andres                   5158 ECB             :     else
  368 andres                   5159 GIC        1045 :         didCrash = false;
  417 heikki.linnakangas       5160 ECB             : 
                               5161                 :     /*
                               5162                 :      * Prepare for WAL recovery if needed.
                               5163                 :      *
                               5164                 :      * InitWalRecovery analyzes the control file and the backup label file, if
                               5165                 :      * any.  It updates the in-memory ControlFile buffer according to the
                               5166                 :      * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
                               5167                 :      * It also applies the tablespace map file, if any.
                               5168                 :      */
  417 heikki.linnakangas       5169 CBC        1176 :     InitWalRecovery(ControlFile, &wasShutdown,
  417 heikki.linnakangas       5170 ECB             :                     &haveBackupLabel, &haveTblspcMap);
  417 heikki.linnakangas       5171 GIC        1176 :     checkPoint = ControlFile->checkPointCopy;
 8586 vadim4o                  5172 ECB             : 
                               5173                 :     /* initialize shared memory variables from the checkpoint record */
  971 andres                   5174 CBC        1176 :     ShmemVariableCache->nextXid = checkPoint.nextXid;
 8586 vadim4o                  5175 GIC        1176 :     ShmemVariableCache->nextOid = checkPoint.nextOid;
 8192                          5176            1176 :     ShmemVariableCache->oidCount = 0;
 6514 tgl                      5177            1176 :     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
 2208 rhaas                    5178            1176 :     AdvanceOldestClogXid(checkPoint.oldestXid);
 4799 tgl                      5179            1176 :     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
 2217 tgl                      5180 CBC        1176 :     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
 2659 mail                     5181 GIC        1176 :     SetCommitTsLimit(checkPoint.oldestCommitTsXid,
                               5182                 :                      checkPoint.newestCommitTsXid);
  971 andres                   5183            1176 :     XLogCtl->ckptFullXid = checkPoint.nextXid;
                               5184                 : 
                               5185                 :     /*
                               5186                 :      * Clear out any old relcache cache files.  This is *necessary* if we do
                               5187                 :      * any WAL replay, since that would probably result in the cache files
                               5188                 :      * being out of sync with database reality.  In theory we could leave them
                               5189                 :      * in place if the database had been cleanly shut down, but it seems
                               5190                 :      * safest to just remove them always and let them be rebuilt during the
                               5191                 :      * first backend startup.  These files needs to be removed from all
                               5192                 :      * directories including pg_tblspc, however the symlinks are created only
                               5193                 :      * after reading tablespace_map file in case of archive recovery from
                               5194                 :      * backup, so needs to clear old relcache files here after creating
                               5195                 :      * symlinks.
  417 heikki.linnakangas       5196 ECB             :      */
  417 heikki.linnakangas       5197 GIC        1176 :     RelationCacheInitFileRemove();
                               5198                 : 
                               5199                 :     /*
                               5200                 :      * Initialize replication slots, before there's a chance to remove
                               5201                 :      * required resources.
 3355 rhaas                    5202 ECB             :      */
 3223 andres                   5203 CBC        1176 :     StartupReplicationSlots();
                               5204                 : 
 3324 rhaas                    5205 ECB             :     /*
                               5206                 :      * Startup logical state, needs to be setup now so we have proper data
                               5207                 :      * during crash recovery.
                               5208                 :      */
 3324 rhaas                    5209 GIC        1176 :     StartupReorderBuffer();
 3324 rhaas                    5210 ECB             : 
  802 rhaas                    5211 EUB             :     /*
                               5212                 :      * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
                               5213                 :      * been initialized and before we accept connections or begin WAL replay.
  802 rhaas                    5214 ECB             :      */
  802 rhaas                    5215 GIC        1176 :     StartupCLOG();
  802 rhaas                    5216 ECB             : 
                               5217                 :     /*
                               5218                 :      * Startup MultiXact. We need to do this early to be able to replay
                               5219                 :      * truncations.
                               5220                 :      */
 3418 alvherre                 5221 GIC        1176 :     StartupMultiXact();
 3418 alvherre                 5222 ECB             : 
                               5223                 :     /*
                               5224                 :      * Ditto for commit timestamps.  Activate the facility if the setting is
 1656 michael                  5225                 :      * enabled in the control file, as there should be no tracking of commit
                               5226                 :      * timestamps done when the setting was disabled.  This facility can be
                               5227                 :      * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
 2676 alvherre                 5228                 :      */
 1656 michael                  5229 GIC        1176 :     if (ControlFile->track_commit_timestamp)
 2676 alvherre                 5230               8 :         StartupCommitTs();
 2676 alvherre                 5231 ECB             : 
                               5232                 :     /*
 2902 andres                   5233 EUB             :      * Recover knowledge about replay progress of known replication partners.
                               5234                 :      */
 2902 andres                   5235 GIC        1176 :     StartupReplicationOrigin();
                               5236                 : 
 3709 heikki.linnakangas       5237 EUB             :     /*
                               5238                 :      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
                               5239                 :      * control file. On recovery, all unlogged relations are blown away, so
                               5240                 :      * the unlogged LSN counter can be reset too.
                               5241                 :      */
 3709 heikki.linnakangas       5242 GIC        1176 :     if (ControlFile->state == DB_SHUTDOWNED)
                               5243            1025 :         XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
                               5244                 :     else
 1260 michael                  5245 GBC         151 :         XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
                               5246                 : 
 3728 heikki.linnakangas       5247 ECB             :     /*
 3602 bruce                    5248                 :      * Copy any missing timeline history files between 'now' and the recovery
                               5249                 :      * target timeline from archive to pg_wal. While we don't need those files
                               5250                 :      * ourselves - the history file of the recovery target timeline covers all
                               5251                 :      * the previous timelines in the history too - a cascading standby server
                               5252                 :      * might be interested in them. Or, if you archive the WAL from this
  697 tgl                      5253                 :      * server to a different archive than the primary, it'd be good for all
                               5254                 :      * the history files to get archived there after failover, so that you can
                               5255                 :      * use one of the old timelines as a PITR target. Timeline history files
                               5256                 :      * are small, so it's better to copy them unnecessarily than not copy them
                               5257                 :      * and regret later.
                               5258                 :      */
  417 heikki.linnakangas       5259 CBC        1176 :     restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
                               5260                 : 
 2196 simon                    5261 EUB             :     /*
 2153 bruce                    5262                 :      * Before running in recovery, scan pg_twophase and fill in its status to
                               5263                 :      * be able to work on entries generated by redo.  Doing a scan before
                               5264                 :      * taking any recovery action has the merit to discard any 2PC files that
                               5265                 :      * are newer than the first record to replay, saving from any conflicts at
                               5266                 :      * replay.  This avoids as well any subsequent scans when doing recovery
                               5267                 :      * of the on-disk two-phase data.
                               5268                 :      */
 2196 simon                    5269 GIC        1176 :     restoreTwoPhaseData();
                               5270                 : 
                               5271                 :     /*
                               5272                 :      * When starting with crash recovery, reset pgstat data - it might not be
                               5273                 :      * valid. Otherwise restore pgstat data. It's safe to do this here,
                               5274                 :      * because postmaster will not yet have started any other processes.
                               5275                 :      *
                               5276                 :      * NB: Restoring replication slot stats relies on slot state to have
  368 andres                   5277 ECB             :      * already been restored from disk.
                               5278                 :      *
                               5279                 :      * TODO: With a bit of extra work we could just start with a pgstat file
                               5280                 :      * associated with the checkpoint redo location we're starting from.
                               5281                 :      */
  368 andres                   5282 GIC        1176 :     if (didCrash)
                               5283             131 :         pgstat_discard_stats();
                               5284                 :     else
                               5285            1045 :         pgstat_restore_stats();
                               5286                 : 
 4092 simon                    5287            1176 :     lastFullPageWrites = checkPoint.fullPageWrites;
                               5288                 : 
 3562 heikki.linnakangas       5289            1176 :     RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
 3076                          5290            1176 :     doPageWrites = lastFullPageWrites;
                               5291                 : 
                               5292                 :     /* REDO */
 6822 tgl                      5293            1176 :     if (InRecovery)
                               5294                 :     {
                               5295                 :         /* Initialize state for RecoveryInProgress() */
  417 heikki.linnakangas       5296             151 :         SpinLockAcquire(&XLogCtl->info_lck);
                               5297             151 :         if (InArchiveRecovery)
  417 heikki.linnakangas       5298 CBC          73 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
  417 heikki.linnakangas       5299 ECB             :         else
  417 heikki.linnakangas       5300 GIC          78 :             XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
  417 heikki.linnakangas       5301 CBC         151 :         SpinLockRelease(&XLogCtl->info_lck);
  417 heikki.linnakangas       5302 ECB             : 
 6089 tgl                      5303                 :         /*
                               5304                 :          * Update pg_control to show that we are recovering and to show the
                               5305                 :          * selected checkpoint as the place we are starting from. We also mark
 6031 bruce                    5306                 :          * pg_control with any minimum recovery stop point obtained from a
                               5307                 :          * backup history file.
                               5308                 :          *
                               5309                 :          * No need to hold ControlFileLock yet, we aren't up far enough.
                               5310                 :          */
  417 heikki.linnakangas       5311 GIC         151 :         UpdateControlFile();
                               5312                 : 
                               5313                 :         /*
                               5314                 :          * If there was a backup label file, it's done its job and the info
                               5315                 :          * has now been propagated into pg_control.  We must get rid of the
  417 heikki.linnakangas       5316 ECB             :          * label file so that if we crash during recovery, we'll pick up at
                               5317                 :          * the latest recovery restartpoint instead of going all the way back
                               5318                 :          * to the backup start point.  It seems prudent though to just rename
                               5319                 :          * the file out of the way rather than delete it completely.
                               5320                 :          */
  417 heikki.linnakangas       5321 CBC         151 :         if (haveBackupLabel)
 3698 heikki.linnakangas       5322 ECB             :         {
  417 heikki.linnakangas       5323 CBC          51 :             unlink(BACKUP_LABEL_OLD);
                               5324              51 :             durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
 3698 heikki.linnakangas       5325 ECB             :         }
 4790 bruce                    5326                 : 
 4843 heikki.linnakangas       5327                 :         /*
  417                          5328                 :          * If there was a tablespace_map file, it's done its job and the
                               5329                 :          * symlinks have been created.  We must get rid of the map file so
                               5330                 :          * that if we crash during recovery, we don't create symlinks again.
                               5331                 :          * It seems prudent though to just rename the file out of the way
                               5332                 :          * rather than delete it completely.
                               5333                 :          */
  417 heikki.linnakangas       5334 GIC         151 :         if (haveTblspcMap)
                               5335                 :         {
                               5336               1 :             unlink(TABLESPACE_MAP_OLD);
                               5337               1 :             durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
                               5338                 :         }
                               5339                 : 
                               5340                 :         /*
                               5341                 :          * Initialize our local copy of minRecoveryPoint.  When doing crash
                               5342                 :          * recovery we want to replay up to the end of WAL.  Particularly, in
                               5343                 :          * the case of a promoted standby minRecoveryPoint value in the
 1739 michael                  5344 ECB             :          * control file is only updated after the first checkpoint.  However,
                               5345                 :          * if the instance crashes before the first post-recovery checkpoint
                               5346                 :          * is completed then recovery will use a stale location causing the
                               5347                 :          * startup process to think that there are still invalid page
                               5348                 :          * references when checking for data consistency.
                               5349                 :          */
 1739 michael                  5350 CBC         151 :         if (InArchiveRecovery)
                               5351                 :         {
  417 heikki.linnakangas       5352 GIC          73 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
                               5353              73 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
                               5354                 :         }
                               5355                 :         else
 1739 michael                  5356 ECB             :         {
  417 heikki.linnakangas       5357 GIC          78 :             LocalMinRecoveryPoint = InvalidXLogRecPtr;
                               5358              78 :             LocalMinRecoveryPointTLI = 0;
                               5359                 :         }
                               5360                 : 
                               5361                 :         /* Check that the GUCs used to generate the WAL allow recovery */
 4729 heikki.linnakangas       5362 CBC         151 :         CheckRequiredParameterValues();
                               5363                 : 
                               5364                 :         /*
                               5365                 :          * We're in recovery, so unlogged relations may be trashed and must be
                               5366                 :          * reset.  This should be done BEFORE allowing Hot Standby
                               5367                 :          * connections, so that read-only backends don't try to read whatever
 3955 bruce                    5368 ECB             :          * garbage is left over from before.
                               5369                 :          */
 4484 rhaas                    5370 GIC         151 :         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
                               5371                 : 
                               5372                 :         /*
                               5373                 :          * Likewise, delete any saved transaction snapshot files that got left
                               5374                 :          * behind by crashed backends.
                               5375                 :          */
 4187 tgl                      5376 CBC         151 :         DeleteAllExportedSnapshotFiles();
 4187 tgl                      5377 ECB             : 
                               5378                 :         /*
                               5379                 :          * Initialize for Hot Standby, if enabled. We won't let backends in
                               5380                 :          * yet, not until we've reached the min recovery point specified in
                               5381                 :          * control file and we've established a recovery snapshot from a
 4859 simon                    5382                 :          * running-xacts WAL record.
                               5383                 :          */
 3698 heikki.linnakangas       5384 GIC         151 :         if (ArchiveRecoveryRequested && EnableHotStandby)
                               5385                 :         {
                               5386                 :             TransactionId *xids;
                               5387                 :             int         nxids;
                               5388                 : 
 4804 heikki.linnakangas       5389 CBC          71 :             ereport(DEBUG1,
  781 peter                    5390 ECB             :                     (errmsg_internal("initializing for hot standby")));
                               5391                 : 
 4859 simon                    5392 CBC          71 :             InitRecoveryTransactionEnvironment();
                               5393                 : 
 4859 simon                    5394 GIC          71 :             if (wasShutdown)
                               5395              12 :                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
                               5396                 :             else
                               5397              59 :                 oldestActiveXID = checkPoint.oldestActiveXid;
                               5398              71 :             Assert(TransactionIdIsValid(oldestActiveXID));
                               5399                 : 
                               5400                 :             /* Tell procarray about the range of xids it has to deal with */
  971 andres                   5401              71 :             ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
                               5402                 : 
                               5403                 :             /*
                               5404                 :              * Startup subtrans only.  CLOG, MultiXact and commit timestamp
                               5405                 :              * have already been started up and other SLRUs are not maintained
  697 tgl                      5406 ECB             :              * during recovery and need not be started yet.
                               5407                 :              */
 4859 simon                    5408 GIC          71 :             StartupSUBTRANS(oldestActiveXID);
                               5409                 : 
                               5410                 :             /*
                               5411                 :              * If we're beginning at a shutdown checkpoint, we know that
                               5412                 :              * nothing was running on the primary at this point. So fake-up an
                               5413                 :              * empty running-xacts record and use that here and now. Recover
                               5414                 :              * additional standby state for prepared transactions.
                               5415                 :              */
 4744 heikki.linnakangas       5416 CBC          71 :             if (wasShutdown)
                               5417                 :             {
                               5418                 :                 RunningTransactionsData running;
                               5419                 :                 TransactionId latestCompletedXid;
                               5420                 : 
                               5421                 :                 /*
                               5422                 :                  * Construct a RunningTransactions snapshot representing a
                               5423                 :                  * shut down server, with only prepared transactions still
                               5424                 :                  * alive. We're never overflowed at this point because all
                               5425                 :                  * subxids are listed with their parent prepared transactions.
                               5426                 :                  */
 4744 heikki.linnakangas       5427 GIC          12 :                 running.xcnt = nxids;
 3780 simon                    5428              12 :                 running.subxcnt = 0;
 4744 heikki.linnakangas       5429 CBC          12 :                 running.subxid_overflow = false;
  971 andres                   5430              12 :                 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
 4744 heikki.linnakangas       5431 GIC          12 :                 running.oldestRunningXid = oldestActiveXID;
  971 andres                   5432 CBC          12 :                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
 4714 simon                    5433 GIC          12 :                 TransactionIdRetreat(latestCompletedXid);
 4713 simon                    5434 CBC          12 :                 Assert(TransactionIdIsNormal(latestCompletedXid));
 4714 simon                    5435 GIC          12 :                 running.latestCompletedXid = latestCompletedXid;
 4744 heikki.linnakangas       5436 CBC          12 :                 running.xids = xids;
 4744 heikki.linnakangas       5437 ECB             : 
 4744 heikki.linnakangas       5438 GIC          12 :                 ProcArrayApplyRecoveryInfo(&running);
                               5439                 : 
 2173 simon                    5440 CBC          12 :                 StandbyRecoverPreparedTransactions();
                               5441                 :             }
                               5442                 :         }
 4859 simon                    5443 ECB             : 
 1166 peter                    5444                 :         /*
  417 heikki.linnakangas       5445                 :          * We're all set for replaying the WAL now. Do it.
                               5446                 :          */
  417 heikki.linnakangas       5447 CBC         151 :         PerformWalRecovery();
                               5448             117 :         performedWalRecovery = true;
                               5449                 :     }
                               5450                 :     else
  413 heikki.linnakangas       5451 GIC        1025 :         performedWalRecovery = false;
                               5452                 : 
                               5453                 :     /*
                               5454                 :      * Finish WAL recovery.
                               5455                 :      */
  417                          5456            1142 :     endOfRecoveryInfo = FinishWalRecovery();
                               5457            1142 :     EndOfLog = endOfRecoveryInfo->endOfLog;
  417 heikki.linnakangas       5458 CBC        1142 :     EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
  417 heikki.linnakangas       5459 GIC        1142 :     abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
                               5460            1142 :     missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
                               5461                 : 
                               5462                 :     /*
                               5463                 :      * Reset ps status display, so that no information related to recovery
                               5464                 :      * shows up.
                               5465                 :      */
  199 michael                  5466            1142 :     set_ps_display("");
                               5467                 : 
 6822 tgl                      5468 ECB             :     /*
                               5469                 :      * When recovering from a backup (we are in recovery, and archive recovery
  368 sfrost                   5470                 :      * was requested), complain if we did not roll forward far enough to reach
                               5471                 :      * the point where the database is consistent.  For regular online
                               5472                 :      * backup-from-primary, that means reaching the end-of-backup WAL record
                               5473                 :      * (at which point we reset backupStartPoint to be Invalid), for
                               5474                 :      * backup-from-replica (which can't inject records into the WAL stream),
                               5475                 :      * that point is when we reach the minRecoveryPoint in pg_control (which
                               5476                 :      * we purposefully copy last when backing up from a replica).  For
                               5477                 :      * pg_rewind (which creates a backup_label with a method of "pg_rewind")
                               5478                 :      * or snapshot-style backups (which don't), backupEndRequired will be set
                               5479                 :      * to false.
                               5480                 :      *
                               5481                 :      * Note: it is indeed okay to look at the local variable
                               5482                 :      * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
  417 heikki.linnakangas       5483                 :      * might be further ahead --- ControlFile->minRecoveryPoint cannot have
                               5484                 :      * been advanced beyond the WAL we processed.
                               5485                 :      */
 4393 heikki.linnakangas       5486 GIC        1142 :     if (InRecovery &&
  417                          5487             117 :         (EndOfLog < LocalMinRecoveryPoint ||
 4843                          5488             117 :          !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
                               5489                 :     {
                               5490                 :         /*
                               5491                 :          * Ran off end of WAL before reaching end-of-backup WAL record, or
                               5492                 :          * minRecoveryPoint. That's a bad sign, indicating that you tried to
                               5493                 :          * recover from an online backup but never called pg_backup_stop(), or
                               5494                 :          * you didn't archive all the WAL needed.
                               5495                 :          */
 3698 heikki.linnakangas       5496 UIC           0 :         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
 4379 heikki.linnakangas       5497 ECB             :         {
  368 sfrost                   5498 UIC           0 :             if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
 4260 heikki.linnakangas       5499 LBC           0 :                 ereport(FATAL,
 4260 heikki.linnakangas       5500 ECB             :                         (errmsg("WAL ends before end of online backup"),
                               5501                 :                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
                               5502                 :             else
 4379 heikki.linnakangas       5503 UIC           0 :                 ereport(FATAL,
 2118 tgl                      5504 ECB             :                         (errmsg("WAL ends before consistent recovery point")));
 4379 heikki.linnakangas       5505                 :         }
                               5506                 :     }
                               5507                 : 
                               5508                 :     /*
  417                          5509                 :      * Reset unlogged relations to the contents of their INIT fork. This is
                               5510                 :      * done AFTER recovery is complete so as to include any unlogged relations
                               5511                 :      * created during recovery, but BEFORE recovery is marked as having
                               5512                 :      * completed successfully. Otherwise we'd not retry if any of the post
                               5513                 :      * end-of-recovery steps fail.
                               5514                 :      */
  417 heikki.linnakangas       5515 GIC        1142 :     if (InRecovery)
                               5516             117 :         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
  417 heikki.linnakangas       5517 ECB             : 
                               5518                 :     /*
                               5519                 :      * Pre-scan prepared transactions to find out the range of XIDs present.
                               5520                 :      * This information is not quite needed yet, but it is positioned here so
                               5521                 :      * that potential problems are detected before any on-disk change is done.
                               5522                 :      */
 1735 michael                  5523 CBC        1142 :     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
                               5524                 : 
                               5525                 :     /*
                               5526                 :      * Allow ordinary WAL segment creation before possibly switching to a new
                               5527                 :      * timeline, which creates a new segment, and after the last ReadRecord().
                               5528                 :      */
  235 michael                  5529 GNC        1142 :     SetInstallXLogFileSegmentActive();
                               5530                 : 
                               5531                 :     /*
                               5532                 :      * Consider whether we need to assign a new timeline ID.
                               5533                 :      *
  417 heikki.linnakangas       5534 ECB             :      * If we did archive recovery, we always assign a new ID.  This handles a
                               5535                 :      * couple of issues.  If we stopped short of the end of WAL during
                               5536                 :      * recovery, then we are clearly generating a new timeline and must assign
                               5537                 :      * it a unique new ID.  Even if we ran to the end, modifying the current
                               5538                 :      * last segment is problematic because it may result in trying to
 5624 bruce                    5539                 :      * overwrite an already-archived copy of that segment, and we encourage
 5671 tgl                      5540                 :      * DBAs to make their archive_commands reject that.  We can dodge the
                               5541                 :      * problem by making the new active segment have a new timeline ID.
                               5542                 :      *
                               5543                 :      * In a normal crash recovery, we can just extend the timeline we were in.
                               5544                 :      */
  417 heikki.linnakangas       5545 GIC        1142 :     newTLI = endOfRecoveryInfo->lastRecTLI;
 3698 heikki.linnakangas       5546 CBC        1142 :     if (ArchiveRecoveryRequested)
                               5547                 :     {
  515 rhaas                    5548 GIC          39 :         newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
 6836 tgl                      5549              39 :         ereport(LOG,
                               5550                 :                 (errmsg("selected new timeline ID: %u", newTLI)));
                               5551                 : 
                               5552                 :         /*
  417 heikki.linnakangas       5553 ECB             :          * Make a writable copy of the last WAL segment.  (Note that we also
                               5554                 :          * have a copy of the last block of the old WAL in
                               5555                 :          * endOfRecoveryInfo->lastPage; we will use that below.)
                               5556                 :          */
  417 heikki.linnakangas       5557 GIC          39 :         XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
                               5558                 : 
                               5559                 :         /*
                               5560                 :          * Remove the signal files out of the way, so that we don't
  417 heikki.linnakangas       5561 ECB             :          * accidentally re-enter archive recovery mode in a subsequent crash.
                               5562                 :          */
  417 heikki.linnakangas       5563 GIC          39 :         if (endOfRecoveryInfo->standby_signal_file_found)
                               5564              36 :             durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
                               5565                 : 
                               5566              39 :         if (endOfRecoveryInfo->recovery_signal_file_found)
                               5567               3 :             durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
                               5568                 : 
                               5569                 :         /*
                               5570                 :          * Write the timeline history file, and have it archived. After this
                               5571                 :          * point (or rather, as soon as the file is archived), the timeline
 1735 michael                  5572 ECB             :          * will appear as "taken" in the WAL archive and to any standby
                               5573                 :          * servers.  If we crash before actually switching to the new
                               5574                 :          * timeline, standby servers will nevertheless think that we switched
                               5575                 :          * to the new timeline, and will try to connect to the new timeline.
                               5576                 :          * To minimize the window for that, try to do as little as possible
                               5577                 :          * between here and writing the end-of-recovery record.
                               5578                 :          */
  515 rhaas                    5579 CBC          39 :         writeTimeLineHistory(newTLI, recoveryTargetTLI,
  417 heikki.linnakangas       5580 ECB             :                              EndOfLog, endOfRecoveryInfo->recoveryStopReason);
 1285 michael                  5581                 : 
  417 heikki.linnakangas       5582 GIC          39 :         ereport(LOG,
  417 heikki.linnakangas       5583 ECB             :                 (errmsg("archive recovery complete")));
                               5584                 :     }
 6836 tgl                      5585                 : 
                               5586                 :     /* Save the selected TimeLineID in shared memory, too */
  515 rhaas                    5587 GIC        1142 :     XLogCtl->InsertTimeLineID = newTLI;
  417 heikki.linnakangas       5588            1142 :     XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
                               5589                 : 
                               5590                 :     /*
                               5591                 :      * Actually, if WAL ended in an incomplete record, skip the parts that
  557 alvherre                 5592 ECB             :      * made it through and start writing after the portion that persisted.
                               5593                 :      * (It's critical to first write an OVERWRITE_CONTRECORD message, which
                               5594                 :      * we'll do as soon as we're open for writing new WAL.)
                               5595                 :      */
  557 alvherre                 5596 CBC        1142 :     if (!XLogRecPtrIsInvalid(missingContrecPtr))
                               5597                 :     {
                               5598                 :         /*
                               5599                 :          * We should only have a missingContrecPtr if we're not switching to
                               5600                 :          * a new timeline. When a timeline switch occurs, WAL is copied from
  223 rhaas                    5601 ECB             :          * the old timeline to the new only up to the end of the last complete
                               5602                 :          * record, so there can't be an incomplete WAL record that we need to
                               5603                 :          * disregard.
                               5604                 :          */
  223 rhaas                    5605 CBC           1 :         Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
  557 alvherre                 5606 GIC           1 :         Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
                               5607               1 :         EndOfLog = missingContrecPtr;
                               5608                 :     }
                               5609                 : 
                               5610                 :     /*
 2158 peter_e                  5611 ECB             :      * Prepare to write WAL starting at EndOfLog location, and init xlog
                               5612                 :      * buffer cache using the block containing the last record from the
                               5613                 :      * previous incarnation.
                               5614                 :      */
 8198 vadim4o                  5615 GIC        1142 :     Insert = &XLogCtl->Insert;
  417 heikki.linnakangas       5616            1142 :     Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
 3553                          5617            1142 :     Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
                               5618                 : 
                               5619                 :     /*
                               5620                 :      * Tricky point here: lastPage contains the *last* block that the LastRec
                               5621                 :      * record spans, not the one it starts in.  The last block is indeed the
                               5622                 :      * one we want to use.
                               5623                 :      */
                               5624            1142 :     if (EndOfLog % XLOG_BLCKSZ != 0)
                               5625                 :     {
                               5626                 :         char       *page;
                               5627                 :         int         len;
                               5628                 :         int         firstIdx;
                               5629                 : 
                               5630            1128 :         firstIdx = XLogRecPtrToBufIdx(EndOfLog);
  417 heikki.linnakangas       5631 CBC        1128 :         len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
                               5632            1128 :         Assert(len < XLOG_BLCKSZ);
 8198 vadim4o                  5633 ECB             : 
                               5634                 :         /* Copy the valid part of the last block, and zero the rest */
 3553 heikki.linnakangas       5635 GIC        1128 :         page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
  417                          5636            1128 :         memcpy(page, endOfRecoveryInfo->lastPage, len);
 3553                          5637            1128 :         memset(page + len, 0, XLOG_BLCKSZ - len);
                               5638                 : 
  417                          5639            1128 :         XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
                               5640            1128 :         XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
 7206 tgl                      5641 EUB             :     }
                               5642                 :     else
                               5643                 :     {
                               5644                 :         /*
                               5645                 :          * There is no partial block to copy. Just set InitializedUpTo, and
                               5646                 :          * let the first attempt to insert a log record to initialize the next
                               5647                 :          * buffer.
                               5648                 :          */
 3553 heikki.linnakangas       5649 GIC          14 :         XLogCtl->InitializedUpTo = EndOfLog;
                               5650                 :     }
                               5651                 : 
                               5652            1142 :     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
                               5653                 : 
                               5654            1142 :     XLogCtl->LogwrtResult = LogwrtResult;
                               5655                 : 
                               5656            1142 :     XLogCtl->LogwrtRqst.Write = EndOfLog;
                               5657            1142 :     XLogCtl->LogwrtRqst.Flush = EndOfLog;
                               5658                 : 
                               5659                 :     /*
 8062 tgl                      5660 ECB             :      * Preallocate additional log files, if wanted.
                               5661                 :      */
  515 rhaas                    5662 GIC        1142 :     PreallocXlogFiles(EndOfLog, newTLI);
                               5663                 : 
                               5664                 :     /*
                               5665                 :      * Okay, we're officially UP.
                               5666                 :      */
 8198 vadim4o                  5667            1142 :     InRecovery = false;
 8586 vadim4o                  5668 ECB             : 
                               5669                 :     /* start the archive_timeout timer and LSN running */
 3553 heikki.linnakangas       5670 GIC        1142 :     XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
 2299 andres                   5671            1142 :     XLogCtl->lastSegSwitchLSN = EndOfLog;
                               5672                 : 
                               5673                 :     /* also initialize latestCompletedXid, to nextXid - 1 */
 4080 tgl                      5674 CBC        1142 :     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
  971 andres                   5675 GIC        1142 :     ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
                               5676            1142 :     FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
 4080 tgl                      5677            1142 :     LWLockRelease(ProcArrayLock);
                               5678                 : 
                               5679                 :     /*
                               5680                 :      * Start up subtrans, if not already done for hot standby.  (commit
                               5681                 :      * timestamps are started below, if necessary.)
                               5682                 :      */
 4859 simon                    5683            1142 :     if (standbyState == STANDBY_DISABLED)
                               5684            1103 :         StartupSUBTRANS(oldestActiveXID);
                               5685                 : 
                               5686                 :     /*
                               5687                 :      * Perform end of recovery actions for any SLRUs that need it.
                               5688                 :      */
 4176                          5689            1142 :     TrimCLOG();
 3418 alvherre                 5690 CBC        1142 :     TrimMultiXact();
 4176 simon                    5691 ECB             : 
                               5692                 :     /* Reload shared-memory state for prepared transactions */
 6505 tgl                      5693 CBC        1142 :     RecoverPreparedTransactions();
 6505 tgl                      5694 ECB             : 
                               5695                 :     /* Shut down xlogreader */
  417 heikki.linnakangas       5696 GIC        1142 :     ShutdownWalRecovery();
                               5697                 : 
                               5698                 :     /* Enable WAL writes for this backend only. */
  542 rhaas                    5699            1142 :     LocalSetXLogInsertAllowed();
                               5700                 : 
                               5701                 :     /* If necessary, write overwrite-contrecord before doing anything else */
  542 rhaas                    5702 CBC        1142 :     if (!XLogRecPtrIsInvalid(abortedRecPtr))
                               5703                 :     {
  542 rhaas                    5704 GIC           1 :         Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
  417 heikki.linnakangas       5705               1 :         CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
                               5706                 :     }
                               5707                 : 
  542 rhaas                    5708 ECB             :     /*
                               5709                 :      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
                               5710                 :      * record before resource manager writes cleanup WAL records or checkpoint
                               5711                 :      * record is written.
                               5712                 :      */
  542 rhaas                    5713 GIC        1142 :     Insert->fullPageWrites = lastFullPageWrites;
                               5714            1142 :     UpdateFullPageWrites();
                               5715                 : 
                               5716                 :     /*
                               5717                 :      * Emit checkpoint or end-of-recovery record in XLOG, if required.
                               5718                 :      */
  417 heikki.linnakangas       5719            1142 :     if (performedWalRecovery)
  542 rhaas                    5720             117 :         promoted = PerformRecoveryXLogAction();
                               5721                 : 
                               5722                 :     /*
                               5723                 :      * If any of the critical GUCs have changed, log them before we allow
 4729 heikki.linnakangas       5724 ECB             :      * backends to write WAL.
                               5725                 :      */
 4729 heikki.linnakangas       5726 GIC        1142 :     XLogReportParameters();
 4729 heikki.linnakangas       5727 ECB             : 
                               5728                 :     /* If this is archive recovery, perform post-recovery cleanup actions. */
  531 rhaas                    5729 GIC        1142 :     if (ArchiveRecoveryRequested)
  515                          5730              39 :         CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
                               5731                 : 
 3049 alvherre                 5732 ECB             :     /*
 2878 bruce                    5733                 :      * Local WAL inserts enabled, so it's time to finish initialization of
                               5734                 :      * commit timestamp.
                               5735                 :      */
 3049 alvherre                 5736 GIC        1142 :     CompleteCommitTsInitialization();
                               5737                 : 
                               5738                 :     /*
                               5739                 :      * All done with end-of-recovery actions.
                               5740                 :      *
 2448 peter_e                  5741 ECB             :      * Now allow backends to write WAL and update the control file status in
                               5742                 :      * consequence.  SharedRecoveryState, that controls if backends can write
                               5743                 :      * WAL, is updated while holding ControlFileLock to prevent other backends
                               5744                 :      * to look at an inconsistent state of the control file in shared memory.
                               5745                 :      * There is still a small window during which backends can write WAL and
                               5746                 :      * the control file is still referring to a system not in DB_IN_PRODUCTION
                               5747                 :      * state while looking at the on-disk control file.
                               5748                 :      *
                               5749                 :      * Also, we use info_lck to update SharedRecoveryState to ensure that
  792 michael                  5750                 :      * there are no race conditions concerning visibility of other recent
                               5751                 :      * updates to shared memory.
 5163 heikki.linnakangas       5752                 :      */
 2448 peter_e                  5753 GIC        1142 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                               5754            1142 :     ControlFile->state = DB_IN_PRODUCTION;
                               5755                 : 
 3121 andres                   5756            1142 :     SpinLockAcquire(&XLogCtl->info_lck);
 1080 michael                  5757            1142 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
 3121 andres                   5758            1142 :     SpinLockRelease(&XLogCtl->info_lck);
                               5759                 : 
 2448 peter_e                  5760 CBC        1142 :     UpdateControlFile();
                               5761            1142 :     LWLockRelease(ControlFileLock);
 2448 peter_e                  5762 ECB             : 
                               5763                 :     /*
                               5764                 :      * Shutdown the recovery environment.  This must occur after
                               5765                 :      * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
                               5766                 :      * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
                               5767                 :      * any session building a snapshot will not rely on KnownAssignedXids as
                               5768                 :      * RecoveryInProgress() would return false at this stage.  This is
  552 michael                  5769                 :      * particularly critical for prepared 2PC transactions, that would still
                               5770                 :      * need to be included in snapshots once recovery has ended.
                               5771                 :      */
  552 michael                  5772 GIC        1142 :     if (standbyState != STANDBY_DISABLED)
                               5773              39 :         ShutdownRecoveryTransactionEnvironment();
                               5774                 : 
 3769 heikki.linnakangas       5775 ECB             :     /*
 3602 bruce                    5776                 :      * If there were cascading standby servers connected to us, nudge any wal
                               5777                 :      * sender processes to notice that we've been promoted.
                               5778                 :      */
    1 andres                   5779 GNC        1142 :     WalSndWakeup(true, true);
 3709 heikki.linnakangas       5780 ECB             : 
                               5781                 :     /*
  697 tgl                      5782                 :      * If this was a promotion, request an (online) checkpoint now. This isn't
                               5783                 :      * required for consistency, but the last restartpoint might be far back,
                               5784                 :      * and in case of a crash, recovering from it might take a longer than is
                               5785                 :      * appropriate now that we're not in standby mode anymore.
                               5786                 :      */
  984 fujii                    5787 GIC        1142 :     if (promoted)
 3610 simon                    5788              36 :         RequestCheckpoint(CHECKPOINT_FORCE);
 5163 heikki.linnakangas       5789            1142 : }
                               5790                 : 
                               5791                 : /*
                               5792                 :  * Callback from PerformWalRecovery(), called when we switch from crash
                               5793                 :  * recovery to archive recovery mode.  Updates the control file accordingly.
 4744 heikki.linnakangas       5794 ECB             :  */
                               5795                 : void
  417 heikki.linnakangas       5796 GIC           2 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
 4744 heikki.linnakangas       5797 ECB             : {
                               5798                 :     /* initialize minRecoveryPoint to this record */
  417 heikki.linnakangas       5799 CBC           2 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  417 heikki.linnakangas       5800 GIC           2 :     ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
  417 heikki.linnakangas       5801 CBC           2 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
  417 heikki.linnakangas       5802 ECB             :     {
  417 heikki.linnakangas       5803 GIC           2 :         ControlFile->minRecoveryPoint = EndRecPtr;
                               5804               2 :         ControlFile->minRecoveryPointTLI = replayTLI;
                               5805                 :     }
                               5806                 :     /* update local copy */
  417 heikki.linnakangas       5807 CBC           2 :     LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
  417 heikki.linnakangas       5808 GIC           2 :     LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
                               5809                 : 
                               5810                 :     /*
                               5811                 :      * The startup process can update its local copy of minRecoveryPoint from
  417 heikki.linnakangas       5812 ECB             :      * this point.
                               5813                 :      */
  417 heikki.linnakangas       5814 GIC           2 :     updateMinRecoveryPoint = true;
 1739 michael                  5815 ECB             : 
  417 heikki.linnakangas       5816 CBC           2 :     UpdateControlFile();
                               5817                 : 
                               5818                 :     /*
  417 heikki.linnakangas       5819 ECB             :      * We update SharedRecoveryState while holding the lock on ControlFileLock
                               5820                 :      * so both states are consistent in shared memory.
 3763                          5821                 :      */
  417 heikki.linnakangas       5822 CBC           2 :     SpinLockAcquire(&XLogCtl->info_lck);
  417 heikki.linnakangas       5823 GIC           2 :     XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
                               5824               2 :     SpinLockRelease(&XLogCtl->info_lck);
                               5825                 : 
                               5826               2 :     LWLockRelease(ControlFileLock);
                               5827               2 : }
 3763 heikki.linnakangas       5828 ECB             : 
  417                          5829                 : /*
                               5830                 :  * Callback from PerformWalRecovery(), called when we reach the end of backup.
                               5831                 :  * Updates the control file accordingly.
                               5832                 :  */
                               5833                 : void
  417 heikki.linnakangas       5834 CBC          51 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
  417 heikki.linnakangas       5835 ECB             : {
                               5836                 :     /*
                               5837                 :      * We have reached the end of base backup, as indicated by pg_control. The
                               5838                 :      * data on disk is now consistent (unless minRecovery point is further
                               5839                 :      * ahead, which can happen if we crashed during previous recovery).  Reset
                               5840                 :      * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
                               5841                 :      * make sure we don't allow starting up at an earlier point even if
                               5842                 :      * recovery is stopped and restarted soon after this.
                               5843                 :      */
  417 heikki.linnakangas       5844 CBC          51 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                               5845                 : 
  417 heikki.linnakangas       5846 GIC          51 :     if (ControlFile->minRecoveryPoint < EndRecPtr)
 4744 heikki.linnakangas       5847 ECB             :     {
  417 heikki.linnakangas       5848 GIC          49 :         ControlFile->minRecoveryPoint = EndRecPtr;
  417 heikki.linnakangas       5849 CBC          49 :         ControlFile->minRecoveryPointTLI = tli;
  417 heikki.linnakangas       5850 ECB             :     }
                               5851                 : 
  417 heikki.linnakangas       5852 GIC          51 :     ControlFile->backupStartPoint = InvalidXLogRecPtr;
                               5853              51 :     ControlFile->backupEndPoint = InvalidXLogRecPtr;
                               5854              51 :     ControlFile->backupEndRequired = false;
                               5855              51 :     UpdateControlFile();
                               5856                 : 
                               5857              51 :     LWLockRelease(ControlFileLock);
 4744 heikki.linnakangas       5858 CBC          51 : }
 4744 heikki.linnakangas       5859 ECB             : 
                               5860                 : /*
                               5861                 :  * Perform whatever XLOG actions are necessary at end of REDO.
                               5862                 :  *
                               5863                 :  * The goal here is to make sure that we'll be able to recover properly if
  543 rhaas                    5864                 :  * we crash again. If we choose to write a checkpoint, we'll write a shutdown
                               5865                 :  * checkpoint rather than an on-line one. This is not particularly critical,
                               5866                 :  * but since we may be assigning a new TLI, using a shutdown checkpoint allows
                               5867                 :  * us to have the rule that TLI only changes in shutdown checkpoints, which
                               5868                 :  * allows some extra error checking in xlog_redo.
                               5869                 :  */
                               5870                 : static bool
  543 rhaas                    5871 CBC         117 : PerformRecoveryXLogAction(void)
                               5872                 : {
  543 rhaas                    5873 GIC         117 :     bool        promoted = false;
  543 rhaas                    5874 ECB             : 
                               5875                 :     /*
                               5876                 :      * Perform a checkpoint to update all our recovery activity to disk.
                               5877                 :      *
                               5878                 :      * Note that we write a shutdown checkpoint rather than an on-line one.
                               5879                 :      * This is not particularly critical, but since we may be assigning a new
                               5880                 :      * TLI, using a shutdown checkpoint allows us to have the rule that TLI
  417 heikki.linnakangas       5881                 :      * only changes in shutdown checkpoints, which allows some extra error
                               5882                 :      * checking in xlog_redo.
                               5883                 :      *
                               5884                 :      * In promotion, only create a lightweight end-of-recovery record instead
                               5885                 :      * of a full checkpoint. A checkpoint is requested later, after we're
                               5886                 :      * fully out of recovery mode and already accepting queries.
                               5887                 :      */
  543 rhaas                    5888 GIC         156 :     if (ArchiveRecoveryRequested && IsUnderPostmaster &&
  417 heikki.linnakangas       5889              39 :         PromoteIsTriggered())
                               5890                 :     {
  543 rhaas                    5891              36 :         promoted = true;
                               5892                 : 
                               5893                 :         /*
                               5894                 :          * Insert a special WAL record to mark the end of recovery, since we
                               5895                 :          * aren't doing a checkpoint. That means that the checkpointer process
                               5896                 :          * may likely be in the middle of a time-smoothed restartpoint and
                               5897                 :          * could continue to be for minutes after this.  That sounds strange,
  417 heikki.linnakangas       5898 ECB             :          * but the effect is roughly the same and it would be stranger to try
                               5899                 :          * to come out of the restartpoint and then checkpoint. We request a
                               5900                 :          * checkpoint later anyway, just for safety.
  543 rhaas                    5901                 :          */
  543 rhaas                    5902 CBC          36 :         CreateEndOfRecoveryRecord();
  543 rhaas                    5903 ECB             :     }
                               5904                 :     else
                               5905                 :     {
  543 rhaas                    5906 CBC          81 :         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
                               5907                 :                           CHECKPOINT_IMMEDIATE |
                               5908                 :                           CHECKPOINT_WAIT);
                               5909                 :     }
                               5910                 : 
  543 rhaas                    5911 GIC         117 :     return promoted;
                               5912                 : }
                               5913                 : 
                               5914                 : /*
                               5915                 :  * Is the system still in recovery?
                               5916                 :  *
 5035 tgl                      5917 ECB             :  * Unlike testing InRecovery, this works in any process that's connected to
                               5918                 :  * shared memory.
                               5919                 :  */
                               5920                 : bool
 5163 heikki.linnakangas       5921 GIC    81142281 : RecoveryInProgress(void)
                               5922                 : {
                               5923                 :     /*
 4790 bruce                    5924 ECB             :      * We check shared state each time only until we leave recovery mode. We
                               5925                 :      * can't re-enter recovery, so there's no need to keep checking after the
                               5926                 :      * shared variable has once been seen false.
                               5927                 :      */
 5163 heikki.linnakangas       5928 GIC    81142281 :     if (!LocalRecoveryInProgress)
                               5929        77601222 :         return false;
                               5930                 :     else
                               5931                 :     {
 3425 heikki.linnakangas       5932 ECB             :         /*
                               5933                 :          * use volatile pointer to make sure we make a fresh read of the
                               5934                 :          * shared variable.
                               5935                 :          */
 5163 heikki.linnakangas       5936 GIC     3541059 :         volatile XLogCtlData *xlogctl = XLogCtl;
                               5937                 : 
 1080 michael                  5938         3541059 :         LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
                               5939                 : 
                               5940                 :         /*
 3425 heikki.linnakangas       5941 ECB             :          * Note: We don't need a memory barrier when we're still in recovery.
                               5942                 :          * We might exit recovery immediately after return, so the caller
                               5943                 :          * can't rely on 'true' meaning that we're still in recovery anyway.
                               5944                 :          */
 5163                          5945                 : 
 5163 heikki.linnakangas       5946 CBC     3541059 :         return LocalRecoveryInProgress;
                               5947                 :     }
 8062 tgl                      5948 ECB             : }
                               5949                 : 
                               5950                 : /*
                               5951                 :  * Returns current recovery state from shared memory.
 1080 michael                  5952                 :  *
                               5953                 :  * This returned state is kept consistent with the contents of the control
                               5954                 :  * file.  See details about the possible values of RecoveryState in xlog.h.
                               5955                 :  */
                               5956                 : RecoveryState
 1080 michael                  5957 GIC          25 : GetRecoveryState(void)
                               5958                 : {
 1080 michael                  5959 ECB             :     RecoveryState retval;
                               5960                 : 
 1080 michael                  5961 CBC          25 :     SpinLockAcquire(&XLogCtl->info_lck);
 1080 michael                  5962 GIC          25 :     retval = XLogCtl->SharedRecoveryState;
                               5963              25 :     SpinLockRelease(&XLogCtl->info_lck);
                               5964                 : 
                               5965              25 :     return retval;
                               5966                 : }
 1080 michael                  5967 ECB             : 
 5035 tgl                      5968                 : /*
                               5969                 :  * Is this process allowed to insert new WAL records?
                               5970                 :  *
                               5971                 :  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
                               5972                 :  * But we also have provisions for forcing the result "true" or "false"
                               5973                 :  * within specific processes regardless of the global state.
                               5974                 :  */
                               5975                 : bool
 5035 tgl                      5976 GIC    43544110 : XLogInsertAllowed(void)
                               5977                 : {
                               5978                 :     /*
 4790 bruce                    5979 ECB             :      * If value is "unconditionally true" or "unconditionally false", just
                               5980                 :      * return it.  This provides the normal fast path once recovery is known
                               5981                 :      * done.
                               5982                 :      */
 5035 tgl                      5983 GIC    43544110 :     if (LocalXLogInsertAllowed >= 0)
                               5984        43451207 :         return (bool) LocalXLogInsertAllowed;
                               5985                 : 
                               5986                 :     /*
                               5987                 :      * Else, must check to see if we're still in recovery.
                               5988                 :      */
 5035 tgl                      5989 CBC       92903 :     if (RecoveryInProgress())
 5035 tgl                      5990 GIC       87067 :         return false;
 5035 tgl                      5991 ECB             : 
                               5992                 :     /*
 4790 bruce                    5993                 :      * On exit from recovery, reset to "unconditionally true", since there is
                               5994                 :      * no need to keep checking.
                               5995                 :      */
 5035 tgl                      5996 GIC        5836 :     LocalXLogInsertAllowed = 1;
 5035 tgl                      5997 CBC        5836 :     return true;
 5035 tgl                      5998 ECB             : }
                               5999                 : 
                               6000                 : /*
                               6001                 :  * Make XLogInsertAllowed() return true in the current process only.
 4992                          6002                 :  *
                               6003                 :  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
                               6004                 :  * and even call LocalSetXLogInsertAllowed() again after that.
                               6005                 :  *
                               6006                 :  * Returns the previous value of LocalXLogInsertAllowed.
                               6007                 :  */
                               6008                 : static int
 5035 tgl                      6009 GIC        1169 : LocalSetXLogInsertAllowed(void)
                               6010                 : {
  417 heikki.linnakangas       6011            1169 :     int         oldXLogAllowed = LocalXLogInsertAllowed;
                               6012                 : 
 5035 tgl                      6013            1169 :     LocalXLogInsertAllowed = 1;
                               6014                 : 
  531 rhaas                    6015            1169 :     return oldXLogAllowed;
 5035 tgl                      6016 ECB             : }
                               6017                 : 
 7695                          6018                 : /*
                               6019                 :  * Return the current Redo pointer from shared memory.
                               6020                 :  *
                               6021                 :  * As a side-effect, the local RedoRecPtr copy is updated.
                               6022                 :  */
                               6023                 : XLogRecPtr
 8137 vadim4o                  6024 GIC      152956 : GetRedoRecPtr(void)
                               6025                 : {
                               6026                 :     XLogRecPtr  ptr;
                               6027                 : 
                               6028                 :     /*
                               6029                 :      * The possibly not up-to-date copy in XlogCtl is enough. Even if we
                               6030                 :      * grabbed a WAL insertion lock to read the authoritative value in
                               6031                 :      * Insert->RedoRecPtr, someone might update it just after we've released
                               6032                 :      * the lock.
 3562 heikki.linnakangas       6033 ECB             :      */
 3121 andres                   6034 CBC      152956 :     SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   6035 GIC      152956 :     ptr = XLogCtl->RedoRecPtr;
 3121 andres                   6036 CBC      152956 :     SpinLockRelease(&XLogCtl->info_lck);
                               6037                 : 
 3562 heikki.linnakangas       6038 GIC      152956 :     if (RedoRecPtr < ptr)
                               6039             742 :         RedoRecPtr = ptr;
                               6040                 : 
 7695 tgl                      6041          152956 :     return RedoRecPtr;
                               6042                 : }
                               6043                 : 
                               6044                 : /*
                               6045                 :  * Return information needed to decide whether a modified block needs a
                               6046                 :  * full-page image to be included in the WAL record.
 3076 heikki.linnakangas       6047 ECB             :  *
                               6048                 :  * The returned values are cached copies from backend-private memory, and
                               6049                 :  * possibly out-of-date or, indeed, uninitialized, in which case they will
                               6050                 :  * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
  482 rhaas                    6051                 :  * re-check them against up-to-date values, while holding the WAL insert lock.
                               6052                 :  */
                               6053                 : void
 3076 heikki.linnakangas       6054 GIC    19592404 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
                               6055                 : {
 3076 heikki.linnakangas       6056 CBC    19592404 :     *RedoRecPtr_p = RedoRecPtr;
 3076 heikki.linnakangas       6057 GIC    19592404 :     *doPageWrites_p = doPageWrites;
                               6058        19592404 : }
                               6059                 : 
                               6060                 : /*
                               6061                 :  * GetInsertRecPtr -- Returns the current insert position.
                               6062                 :  *
                               6063                 :  * NOTE: The value *actually* returned is the position of the last full
                               6064                 :  * xlog page. It lags behind the real insert position by at most 1 page.
                               6065                 :  * For that, we don't need to scan through WAL insertion locks, and an
 3562 heikki.linnakangas       6066 ECB             :  * approximation is enough for the current usage of this function.
                               6067                 :  */
                               6068                 : XLogRecPtr
 5764 tgl                      6069 GIC        2424 : GetInsertRecPtr(void)
                               6070                 : {
                               6071                 :     XLogRecPtr  recptr;
                               6072                 : 
 3121 andres                   6073 CBC        2424 :     SpinLockAcquire(&XLogCtl->info_lck);
                               6074            2424 :     recptr = XLogCtl->LogwrtRqst.Write;
 3121 andres                   6075 GIC        2424 :     SpinLockRelease(&XLogCtl->info_lck);
                               6076                 : 
 5764 tgl                      6077            2424 :     return recptr;
                               6078                 : }
                               6079                 : 
                               6080                 : /*
 4679 tgl                      6081 ECB             :  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
                               6082                 :  * position known to be fsync'd to disk. This should only be used on a
  515 rhaas                    6083                 :  * system that is known not to be in recovery.
                               6084                 :  */
                               6085                 : XLogRecPtr
  520 rhaas                    6086 GIC      147978 : GetFlushRecPtr(TimeLineID *insertTLI)
                               6087                 : {
  515                          6088          147978 :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
                               6089                 : 
 3121 andres                   6090          147978 :     SpinLockAcquire(&XLogCtl->info_lck);
 2644 simon                    6091 CBC      147978 :     LogwrtResult = XLogCtl->LogwrtResult;
 3121 andres                   6092 GIC      147978 :     SpinLockRelease(&XLogCtl->info_lck);
                               6093                 : 
                               6094                 :     /*
                               6095                 :      * If we're writing and flushing WAL, the time line can't be changing, so
                               6096                 :      * no lock is required.
                               6097                 :      */
  520 rhaas                    6098          147978 :     if (insertTLI)
  515                          6099           24724 :         *insertTLI = XLogCtl->InsertTimeLineID;
                               6100                 : 
 2644 simon                    6101          147978 :     return LogwrtResult.Flush;
 4832 heikki.linnakangas       6102 ECB             : }
                               6103                 : 
                               6104                 : /*
                               6105                 :  * GetWALInsertionTimeLine -- Returns the current timeline of a system that
  520 rhaas                    6106                 :  * is not in recovery.
                               6107                 :  */
                               6108                 : TimeLineID
  520 rhaas                    6109 GIC       10349 : GetWALInsertionTimeLine(void)
  520 rhaas                    6110 ECB             : {
  520 rhaas                    6111 GIC       10349 :     Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
                               6112                 : 
                               6113                 :     /* Since the value can't be changing, no lock is required. */
  515                          6114           10349 :     return XLogCtl->InsertTimeLineID;
                               6115                 : }
                               6116                 : 
                               6117                 : /*
                               6118                 :  * GetLastImportantRecPtr -- Returns the LSN of the last important record
                               6119                 :  * inserted. All records not explicitly marked as unimportant are considered
                               6120                 :  * important.
 2299 andres                   6121 ECB             :  *
                               6122                 :  * The LSN is determined by computing the maximum of
                               6123                 :  * WALInsertLocks[i].lastImportantAt.
                               6124                 :  */
                               6125                 : XLogRecPtr
 2299 andres                   6126 GIC        2386 : GetLastImportantRecPtr(void)
                               6127                 : {
 2299 andres                   6128 CBC        2386 :     XLogRecPtr  res = InvalidXLogRecPtr;
 2299 andres                   6129 ECB             :     int         i;
                               6130                 : 
 2299 andres                   6131 GIC       21474 :     for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
                               6132                 :     {
                               6133                 :         XLogRecPtr  last_important;
 2299 andres                   6134 ECB             : 
                               6135                 :         /*
                               6136                 :          * Need to take a lock to prevent torn reads of the LSN, which are
                               6137                 :          * possible on some of the supported platforms. WAL insert locks only
                               6138                 :          * support exclusive mode, so we have to use that.
                               6139                 :          */
 2299 andres                   6140 GIC       19088 :         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
 2299 andres                   6141 CBC       19088 :         last_important = WALInsertLocks[i].l.lastImportantAt;
                               6142           19088 :         LWLockRelease(&WALInsertLocks[i].l.lock);
                               6143                 : 
 2299 andres                   6144 GIC       19088 :         if (res < last_important)
                               6145            2619 :             res = last_important;
                               6146                 :     }
                               6147                 : 
                               6148            2386 :     return res;
                               6149                 : }
                               6150                 : 
                               6151                 : /*
                               6152                 :  * Get the time and LSN of the last xlog segment switch
                               6153                 :  */
 5530 tgl                      6154 ECB             : pg_time_t
 2299 andres                   6155 UIC           0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
 6079 tgl                      6156 ECB             : {
                               6157                 :     pg_time_t   result;
                               6158                 : 
                               6159                 :     /* Need WALWriteLock, but shared lock is sufficient */
 6079 tgl                      6160 LBC           0 :     LWLockAcquire(WALWriteLock, LW_SHARED);
 3553 heikki.linnakangas       6161 UIC           0 :     result = XLogCtl->lastSegSwitchTime;
 2299 andres                   6162               0 :     *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
 6079 tgl                      6163               0 :     LWLockRelease(WALWriteLock);
                               6164                 : 
                               6165               0 :     return result;
                               6166                 : }
                               6167                 : 
                               6168                 : /*
 8062 tgl                      6169 ECB             :  * This must be called ONCE during postmaster or standalone-backend shutdown
                               6170                 :  */
                               6171                 : void
 7058 peter_e                  6172 GIC         971 : ShutdownXLOG(int code, Datum arg)
                               6173                 : {
                               6174                 :     /*
                               6175                 :      * We should have an aux process resource owner to use, and we should not
                               6176                 :      * be in a transaction that's installed some other resowner.
                               6177                 :      */
 1726 tgl                      6178             971 :     Assert(AuxProcessResourceOwner != NULL);
 1726 tgl                      6179 CBC         971 :     Assert(CurrentResourceOwner == NULL ||
 1726 tgl                      6180 ECB             :            CurrentResourceOwner == AuxProcessResourceOwner);
 1726 tgl                      6181 CBC         971 :     CurrentResourceOwner = AuxProcessResourceOwner;
                               6182                 : 
 3587 tgl                      6183 ECB             :     /* Don't be chatty in standalone mode */
 3587 tgl                      6184 CBC         971 :     ereport(IsPostmasterEnvironment ? LOG : NOTICE,
                               6185                 :             (errmsg("shutting down")));
 8586 vadim4o                  6186 ECB             : 
                               6187                 :     /*
                               6188                 :      * Signal walsenders to move to stopping state.
                               6189                 :      */
 2134 andres                   6190 GIC         971 :     WalSndInitStopping();
                               6191                 : 
                               6192                 :     /*
                               6193                 :      * Wait for WAL senders to be in stopping state.  This prevents commands
                               6194                 :      * from writing new WAL.
                               6195                 :      */
                               6196             971 :     WalSndWaitStopping();
                               6197                 : 
 5163 heikki.linnakangas       6198             971 :     if (RecoveryInProgress())
 5163 heikki.linnakangas       6199 CBC          31 :         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
                               6200                 :     else
 5064 heikki.linnakangas       6201 ECB             :     {
                               6202                 :         /*
                               6203                 :          * If archiving is enabled, rotate the last XLOG file so that all the
                               6204                 :          * remaining records are archived (postmaster wakes up the archiver
                               6205                 :          * process one more time at the end of shutdown). The checkpoint
                               6206                 :          * record will go to the next XLOG file and won't be archived (yet).
                               6207                 :          */
  430 rhaas                    6208 GIC         940 :         if (XLogArchivingActive())
 2299 andres                   6209               9 :             RequestXLogSwitch(false);
                               6210                 : 
 5163 heikki.linnakangas       6211             940 :         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
                               6212                 :     }
 8586 vadim4o                  6213             971 : }
 8586 vadim4o                  6214 ECB             : 
                               6215                 : /*
                               6216                 :  * Log start of a checkpoint.
                               6217                 :  */
 5762 tgl                      6218                 : static void
 5163 heikki.linnakangas       6219 CBC         541 : LogCheckpointStart(int flags, bool restartpoint)
 5762 tgl                      6220 ECB             : {
  856 peter                    6221 GIC         541 :     if (restartpoint)
  856 peter                    6222 CBC          28 :         ereport(LOG,
                               6223                 :         /* translator: the placeholders show checkpoint options */
                               6224                 :                 (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
                               6225                 :                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
                               6226                 :                         (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
                               6227                 :                         (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
                               6228                 :                         (flags & CHECKPOINT_FORCE) ? " force" : "",
                               6229                 :                         (flags & CHECKPOINT_WAIT) ? " wait" : "",
                               6230                 :                         (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
  856 peter                    6231 ECB             :                         (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
                               6232                 :                         (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
                               6233                 :     else
  856 peter                    6234 GIC         513 :         ereport(LOG,
  697 tgl                      6235 ECB             :         /* translator: the placeholders show checkpoint options */
  856 peter                    6236                 :                 (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
                               6237                 :                         (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
                               6238                 :                         (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
                               6239                 :                         (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
                               6240                 :                         (flags & CHECKPOINT_FORCE) ? " force" : "",
                               6241                 :                         (flags & CHECKPOINT_WAIT) ? " wait" : "",
                               6242                 :                         (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
                               6243                 :                         (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
                               6244                 :                         (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
 5762 tgl                      6245 GIC         541 : }
 5762 tgl                      6246 ECB             : 
                               6247                 : /*
                               6248                 :  * Log end of a checkpoint.
                               6249                 :  */
                               6250                 : static void
 5163 heikki.linnakangas       6251 GIC        2363 : LogCheckpointEnd(bool restartpoint)
                               6252                 : {
                               6253                 :     long        write_msecs,
  880 tgl                      6254 ECB             :                 sync_msecs,
                               6255                 :                 total_msecs,
                               6256                 :                 longest_msecs,
                               6257                 :                 average_msecs;
                               6258                 :     uint64      average_sync_time;
 5762                          6259                 : 
 5762 tgl                      6260 GIC        2363 :     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
                               6261                 : 
  880                          6262            2363 :     write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
                               6263                 :                                                   CheckpointStats.ckpt_sync_t);
                               6264                 : 
                               6265            2363 :     sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
                               6266                 :                                                  CheckpointStats.ckpt_sync_end_t);
                               6267                 : 
                               6268                 :     /* Accumulate checkpoint timing summary data, in milliseconds. */
  368 andres                   6269            2363 :     PendingCheckpointerStats.checkpoint_write_time += write_msecs;
                               6270            2363 :     PendingCheckpointerStats.checkpoint_sync_time += sync_msecs;
 4021 rhaas                    6271 ECB             : 
                               6272                 :     /*
                               6273                 :      * All of the published timing statistics are accounted for.  Only
                               6274                 :      * continue if a log message is to be written.
                               6275                 :      */
 4021 rhaas                    6276 CBC        2363 :     if (!log_checkpoints)
 4021 rhaas                    6277 GIC        1822 :         return;
                               6278                 : 
  880 tgl                      6279             541 :     total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
                               6280                 :                                                   CheckpointStats.ckpt_end_t);
                               6281                 : 
                               6282                 :     /*
                               6283                 :      * Timing values returned from CheckpointStats are in microseconds.
                               6284                 :      * Convert to milliseconds for consistent printing.
 4499 rhaas                    6285 ECB             :      */
  880 tgl                      6286 CBC         541 :     longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
 4499 rhaas                    6287 ECB             : 
 4499 rhaas                    6288 GIC         541 :     average_sync_time = 0;
 4382 bruce                    6289 CBC         541 :     if (CheckpointStats.ckpt_sync_rels > 0)
 4499 rhaas                    6290 LBC           0 :         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
 4499 rhaas                    6291 UIC           0 :             CheckpointStats.ckpt_sync_rels;
  880 tgl                      6292 GIC         541 :     average_msecs = (long) ((average_sync_time + 999) / 1000);
 4499 rhaas                    6293 ECB             : 
                               6294                 :     /*
                               6295                 :      * ControlFileLock is not required to see ControlFile->checkPoint and
                               6296                 :      * ->checkPointCopy here as we are the only updator of those variables at
                               6297                 :      * this moment.
                               6298                 :      */
  856 peter                    6299 GIC         541 :     if (restartpoint)
                               6300              28 :         ereport(LOG,
                               6301                 :                 (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
                               6302                 :                         "%d WAL file(s) added, %d removed, %d recycled; "
                               6303                 :                         "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
                               6304                 :                         "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
                               6305                 :                         "distance=%d kB, estimate=%d kB; "
                               6306                 :                         "lsn=%X/%X, redo lsn=%X/%X",
                               6307                 :                         CheckpointStats.ckpt_bufs_written,
                               6308                 :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
                               6309                 :                         CheckpointStats.ckpt_segs_added,
                               6310                 :                         CheckpointStats.ckpt_segs_removed,
  856 peter                    6311 EUB             :                         CheckpointStats.ckpt_segs_recycled,
                               6312                 :                         write_msecs / 1000, (int) (write_msecs % 1000),
                               6313                 :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
                               6314                 :                         total_msecs / 1000, (int) (total_msecs % 1000),
                               6315                 :                         CheckpointStats.ckpt_sync_rels,
                               6316                 :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
                               6317                 :                         average_msecs / 1000, (int) (average_msecs % 1000),
                               6318                 :                         (int) (PrevCheckPointDistance / 1024.0),
                               6319                 :                         (int) (CheckPointDistanceEstimate / 1024.0),
                               6320                 :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
                               6321                 :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
                               6322                 :     else
  856 peter                    6323 GIC         513 :         ereport(LOG,
                               6324                 :                 (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
  856 peter                    6325 ECB             :                         "%d WAL file(s) added, %d removed, %d recycled; "
                               6326                 :                         "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
                               6327                 :                         "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
                               6328                 :                         "distance=%d kB, estimate=%d kB; "
                               6329                 :                         "lsn=%X/%X, redo lsn=%X/%X",
                               6330                 :                         CheckpointStats.ckpt_bufs_written,
                               6331                 :                         (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
                               6332                 :                         CheckpointStats.ckpt_segs_added,
                               6333                 :                         CheckpointStats.ckpt_segs_removed,
                               6334                 :                         CheckpointStats.ckpt_segs_recycled,
                               6335                 :                         write_msecs / 1000, (int) (write_msecs % 1000),
                               6336                 :                         sync_msecs / 1000, (int) (sync_msecs % 1000),
                               6337                 :                         total_msecs / 1000, (int) (total_msecs % 1000),
                               6338                 :                         CheckpointStats.ckpt_sync_rels,
                               6339                 :                         longest_msecs / 1000, (int) (longest_msecs % 1000),
                               6340                 :                         average_msecs / 1000, (int) (average_msecs % 1000),
                               6341                 :                         (int) (PrevCheckPointDistance / 1024.0),
                               6342                 :                         (int) (CheckPointDistanceEstimate / 1024.0),
                               6343                 :                         LSN_FORMAT_ARGS(ControlFile->checkPoint),
                               6344                 :                         LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
                               6345                 : }
 2967 heikki.linnakangas       6346                 : 
                               6347                 : /*
                               6348                 :  * Update the estimate of distance between checkpoints.
                               6349                 :  *
                               6350                 :  * The estimate is used to calculate the number of WAL segments to keep
                               6351                 :  * preallocated, see XLOGfileslop().
                               6352                 :  */
                               6353                 : static void
 2967 heikki.linnakangas       6354 CBC        2363 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
 2967 heikki.linnakangas       6355 ECB             : {
                               6356                 :     /*
                               6357                 :      * To estimate the number of segments consumed between checkpoints, keep a
                               6358                 :      * moving average of the amount of WAL generated in previous checkpoint
                               6359                 :      * cycles. However, if the load is bursty, with quiet periods and busy
                               6360                 :      * periods, we want to cater for the peak load. So instead of a plain
                               6361                 :      * moving average, let the average decline slowly if the previous cycle
                               6362                 :      * used less WAL than estimated, but bump it up immediately if it used
                               6363                 :      * more.
                               6364                 :      *
                               6365                 :      * When checkpoints are triggered by max_wal_size, this should converge to
                               6366                 :      * CheckpointSegments * wal_segment_size,
                               6367                 :      *
                               6368                 :      * Note: This doesn't pay any attention to what caused the checkpoint.
                               6369                 :      * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
                               6370                 :      * starting a base backup, are counted the same as those created
                               6371                 :      * automatically. The slow-decline will largely mask them out, if they are
                               6372                 :      * not frequent. If they are frequent, it seems reasonable to count them
                               6373                 :      * in as any others; if you issue a manual checkpoint every 5 minutes and
                               6374                 :      * never let a timed checkpoint happen, it makes sense to base the
                               6375                 :      * preallocation on that 5 minute interval rather than whatever
                               6376                 :      * checkpoint_timeout is set to.
                               6377                 :      */
 2967 heikki.linnakangas       6378 CBC        2363 :     PrevCheckPointDistance = nbytes;
 2967 heikki.linnakangas       6379 GIC        2363 :     if (CheckPointDistanceEstimate < nbytes)
                               6380            1015 :         CheckPointDistanceEstimate = nbytes;
                               6381                 :     else
                               6382            1348 :         CheckPointDistanceEstimate =
                               6383            1348 :             (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
 5762 tgl                      6384            2363 : }
                               6385                 : 
                               6386                 : /*
                               6387                 :  * Set the ps display while a checkpoint or restartpoint runs, or clear it
                               6388                 :  * when "reset" is true.  This routine should not do any allocations, so
                               6389                 :  * that it can be called from a critical section.
  846 michael                  6390 ECB             :  */
                               6391                 : static void
  846 michael                  6392 GIC        4726 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
                               6393                 : {
                               6394                 :     /*
                               6395                 :      * The status is reported only for end-of-recovery and shutdown
                               6396                 :      * checkpoints or shutdown restartpoints; for any other flags we return
                               6397                 :      * without touching the display.  Updating the ps display is useful in
                               6398                 :      * those situations as it may not be possible to rely on pg_stat_activity
                               6399                 :      * to see the status of the checkpointer or the startup process.
                               6400                 :      */
  846 michael                  6401 CBC        4726 :     if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
  846 michael                  6402 GIC        2766 :         return;
                               6403                 : 
                               6404            1960 :     if (reset)
                               6405             980 :         set_ps_display("");    /* blank out the previously set status */
                               6406                 :     else
  846 michael                  6407 ECB             :     {
                               6408                 :         char        activitymsg[128];    /* local buffer, so no allocation */
                               6409                 : 
  846 michael                  6410 GIC        2940 :         snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
                               6411             980 :                  (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
                               6412             980 :                  (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
                               6413                 :                  restartpoint ? "restartpoint" : "checkpoint");
                               6414             980 :         set_ps_display(activitymsg);
                               6415                 :     }
  846 michael                  6416 ECB             : }
                               6417                 : 
                               6418                 : 
                               6419                 : /*
                               6420                 :  * Perform a checkpoint --- either during shutdown, or on-the-fly
 7500 tgl                      6421                 :  *
                               6422                 :  * flags is a bitwise OR of the following:
                               6423                 :  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
                               6424                 :  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 5764                          6425                 :  *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 5730                          6426                 :  *      ignoring checkpoint_completion_target parameter.
                               6427                 :  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
                               6428                 :  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
                               6429                 :  *      CHECKPOINT_END_OF_RECOVERY).
                               6430                 :  *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
                               6431                 :  *
 5762                          6432                 :  * Note: flags contains other bits, of interest here only for logging purposes.
 5764                          6433                 :  * In particular note that this routine is synchronous and does not pay
                               6434                 :  * attention to CHECKPOINT_WAIT.
 3780 simon                    6435                 :  *
                               6436                 :  * If !shutdown then we are writing an online checkpoint. This is a very special
                               6437                 :  * kind of operation and WAL record because the checkpoint action occurs over
                               6438                 :  * a period of time yet logically occurs at just a single LSN. The logical
                               6439                 :  * position of the WAL record (redo ptr) is the same or earlier than the
                               6440                 :  * physical position. When we replay WAL we locate the checkpoint via its
                               6441                 :  * physical position then read the redo ptr and actually start replay at the
                               6442                 :  * earlier logical position. Note that we don't write *anything* to WAL at
                               6443                 :  * the logical position, so that location could be any other kind of WAL record.
                               6444                 :  * All of this mechanism allows us to continue working while we checkpoint.
                               6445                 :  * As a result, timing of actions is critical here; also bear in mind that
 3780 simon                    6446 EUB             :  * this function will likely take minutes to execute on a busy system.
 8062 tgl                      6447                 :  */
 8595 vadim4o                  6448 ECB             : void
 5764 tgl                      6449 GIC        2340 : CreateCheckPoint(int flags)
                               6450                 : {
                               6451                 :     bool        shutdown;
                               6452                 :     CheckPoint  checkPoint;
                               6453                 :     XLogRecPtr  recptr;
                               6454                 :     XLogSegNo   _logSegNo;
 8397 bruce                    6455 CBC        2340 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
 8397 bruce                    6456 ECB             :     uint32      freespace;
                               6457                 :     XLogRecPtr  PriorRedoPtr;
                               6458                 :     XLogRecPtr  curInsert;
                               6459                 :     XLogRecPtr  last_important_lsn;
                               6460                 :     VirtualTransactionId *vxids;
                               6461                 :     int         nvxids;
  531 rhaas                    6462 GIC        2340 :     int         oldXLogAllowed = 0;
                               6463                 : 
                               6464                 :     /*
                               6465                 :      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
                               6466                 :      * issued at a different time.
                               6467                 :      */
 5035 tgl                      6468            2340 :     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
 5036 heikki.linnakangas       6469             967 :         shutdown = true;
                               6470                 :     else
                               6471            1373 :         shutdown = false;
                               6472                 : 
                               6473                 :     /* sanity check */
 5035 tgl                      6474            2340 :     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
 5035 tgl                      6475 UIC           0 :         elog(ERROR, "can't create a checkpoint during recovery");
                               6476                 : 
                               6477                 :     /*
                               6478                 :      * Prepare to accumulate statistics.
 5762 tgl                      6479 ECB             :      *
                               6480                 :      * Note: because it is possible for log_checkpoints to change while a
                               6481                 :      * checkpoint proceeds, we always accumulate stats, even if
                               6482                 :      * log_checkpoints is currently off.
                               6483                 :      */
 5762 tgl                      6484 GIC       25740 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
                               6485            2340 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
                               6486                 : 
                               6487                 :     /*
                               6488                 :      * Let smgr prepare for checkpoint; this has to happen outside the
                               6489                 :      * critical section and before we determine the REDO pointer.  Note that
                               6490                 :      * smgr must not do anything that'd have to be undone if we decide no
                               6491                 :      * checkpoint is needed.
                               6492                 :      */
  389 tmunro                   6493            2340 :     SyncPreCheckpoint();
                               6494                 : 
                               6495                 :     /*
                               6496                 :      * Use a critical section to force system panic if we have trouble.
                               6497                 :      */
 7862 tgl                      6498            2340 :     START_CRIT_SECTION();
                               6499                 : 
 8595 vadim4o                  6500            2340 :     if (shutdown)
                               6501                 :     {
 5163 heikki.linnakangas       6502             967 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 8595 vadim4o                  6503             967 :         ControlFile->state = DB_SHUTDOWNING;
                               6504             967 :         UpdateControlFile();
 5163 heikki.linnakangas       6505             967 :         LWLockRelease(ControlFileLock);
                               6506                 :     }
                               6507                 : 
                               6508                 :     /* Begin filling in the checkpoint WAL record */
 7262 tgl                      6509           28080 :     MemSet(&checkPoint, 0, sizeof(checkPoint));
 5530 tgl                      6510 CBC        2340 :     checkPoint.time = (pg_time_t) time(NULL);
                               6511                 : 
                               6512                 :     /*
                               6513                 :      * For Hot Standby, derive the oldestActiveXid before we fix the redo
                               6514                 :      * pointer. This allows us to begin accumulating changes to assemble our
                               6515                 :      * starting snapshot of locks and transactions.
                               6516                 :      */
 4176 simon                    6517 GIC        2340 :     if (!shutdown && XLogStandbyInfoActive())
                               6518            1353 :         checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
                               6519                 :     else
                               6520             987 :         checkPoint.oldestActiveXid = InvalidTransactionId;
                               6521                 : 
                               6522                 :     /*
                               6523                 :      * Get location of last important record before acquiring insert locks (as
                               6524                 :      * GetLastImportantRecPtr() also locks WAL locks).
                               6525                 :      */
 2299 andres                   6526            2340 :     last_important_lsn = GetLastImportantRecPtr();
                               6527                 : 
                               6528                 :     /*
                               6529                 :      * We must block concurrent insertions while examining insert state to
                               6530                 :      * determine the checkpoint REDO pointer.
                               6531                 :      */
 3306 heikki.linnakangas       6532            2340 :     WALInsertLockAcquireExclusive();
 3562                          6533            2340 :     curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
 8062 tgl                      6534 ECB             : 
                               6535                 :     /*
 2299 andres                   6536                 :      * If this isn't a shutdown or forced checkpoint, and if there has been no
                               6537                 :      * WAL activity requiring a checkpoint, skip it.  The idea here is to
                               6538                 :      * avoid inserting duplicate checkpoints when the system is idle.
 8062 tgl                      6539                 :      */
 5036 heikki.linnakangas       6540 CBC        2340 :     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
                               6541                 :                   CHECKPOINT_FORCE)) == 0)
                               6542                 :     {
 2299 andres                   6543 GIC          28 :         if (last_important_lsn == ControlFile->checkPoint)
                               6544                 :         {
 3306 heikki.linnakangas       6545               5 :             WALInsertLockRelease();
 8062 tgl                      6546               5 :             END_CRIT_SECTION();
 2299 andres                   6547               5 :             ereport(DEBUG1,
  781 peter                    6548 ECB             :                     (errmsg_internal("checkpoint skipped because system is idle")));
 8062 tgl                      6549 GIC           5 :             return;
                               6550                 :         }
                               6551                 :     }
                               6552                 : 
                               6553                 :     /*
                               6554                 :      * An end-of-recovery checkpoint is created before anyone is allowed to
                               6555                 :      * write WAL. To allow us to write the checkpoint record, temporarily
                               6556                 :      * enable XLogInsertAllowed.
 4973 heikki.linnakangas       6557 ECB             :      */
 4973 heikki.linnakangas       6558 CBC        2335 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
  531 rhaas                    6559 GIC          27 :         oldXLogAllowed = LocalSetXLogInsertAllowed();
 4973 heikki.linnakangas       6560 ECB             : 
  515 rhaas                    6561 CBC        2335 :     checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
 3709 heikki.linnakangas       6562 GIC        2335 :     if (flags & CHECKPOINT_END_OF_RECOVERY)
                               6563              27 :         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
                               6564                 :     else
  520 rhaas                    6565            2308 :         checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
 3709 heikki.linnakangas       6566 ECB             : 
 4092 simon                    6567 CBC        2335 :     checkPoint.fullPageWrites = Insert->fullPageWrites;
 4973 heikki.linnakangas       6568 ECB             : 
                               6569                 :     /*
 8062 tgl                      6570                 :      * Compute new REDO record ptr = location of next XLOG record.
                               6571                 :      *
                               6572                 :      * NB: this is NOT necessarily where the checkpoint record itself will be,
                               6573                 :      * since other backends may insert more XLOG records while we're off doing
                               6574                 :      * the buffer flush work.  Those XLOG records are logically after the
                               6575                 :      * checkpoint, even though physically before it.  Got that?
                               6576                 :      */
 3562 heikki.linnakangas       6577 GIC        2335 :     freespace = INSERT_FREESPACE(curInsert);
 3941                          6578            2335 :     if (freespace == 0)
                               6579                 :     {
 2028 andres                   6580 UIC           0 :         if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
 3562 heikki.linnakangas       6581               0 :             curInsert += SizeOfXLogLongPHD;
                               6582                 :         else
                               6583               0 :             curInsert += SizeOfXLogShortPHD;
                               6584                 :     }
 3562 heikki.linnakangas       6585 GIC        2335 :     checkPoint.redo = curInsert;
                               6586                 : 
                               6587                 :     /*
                               6588                 :      * Here we update the shared RedoRecPtr for future XLogInsert calls; this
                               6589                 :      * must be done while holding all the insertion locks.
                               6590                 :      *
                               6591                 :      * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
                               6592                 :      * pointing past where it really needs to point.  This is okay; the only
                               6593                 :      * consequence is that XLogInsert might back up whole buffers that it
                               6594                 :      * didn't really need to.  We can't postpone advancing RedoRecPtr because
                               6595                 :      * XLogInserts that happen while we are dumping buffers must assume that
                               6596                 :      * their buffer changes are not included in the checkpoint.
                               6597                 :      */
 3121 andres                   6598            2335 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
                               6599                 : 
                               6600                 :     /*
                               6601                 :      * Now we can release the WAL insertion locks, allowing other xacts to
                               6602                 :      * proceed while we are flushing disk buffers.
                               6603                 :      */
 3306 heikki.linnakangas       6604            2335 :     WALInsertLockRelease();
 3562 heikki.linnakangas       6605 ECB             : 
                               6606                 :     /* Update the info_lck-protected copy of RedoRecPtr as well */
 3121 andres                   6607 GIC        2335 :     SpinLockAcquire(&XLogCtl->info_lck);
                               6608            2335 :     XLogCtl->RedoRecPtr = checkPoint.redo;
                               6609            2335 :     SpinLockRelease(&XLogCtl->info_lck);
                               6610                 : 
 5762 tgl                      6611 ECB             :     /*
                               6612                 :      * If enabled, log checkpoint start.  We postpone this until now so as not
                               6613                 :      * to log anything if we decided to skip the checkpoint.
                               6614                 :      */
 5762 tgl                      6615 GIC        2335 :     if (log_checkpoints)
 5163 heikki.linnakangas       6616             513 :         LogCheckpointStart(flags, false);
                               6617                 : 
  846 michael                  6618 ECB             :     /* Update the process title */
  846 michael                  6619 GIC        2335 :     update_checkpoint_display(flags, false, false);
                               6620                 : 
                               6621                 :     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
                               6622                 : 
                               6623                 :     /*
 3292 heikki.linnakangas       6624 ECB             :      * Get the other info we need for the checkpoint record.
 2208 rhaas                    6625                 :      *
                               6626                 :      * We don't need to save oldestClogXid in the checkpoint, it only matters
                               6627                 :      * for the short period in which clog is being truncated, and if we crash
                               6628                 :      * during that we'll redo the clog truncation and fix up oldestClogXid
                               6629                 :      * there.
 3292 heikki.linnakangas       6630                 :      */
 3292 heikki.linnakangas       6631 GBC        2335 :     LWLockAcquire(XidGenLock, LW_SHARED);
  971 andres                   6632 GIC        2335 :     checkPoint.nextXid = ShmemVariableCache->nextXid;
 3292 heikki.linnakangas       6633            2335 :     checkPoint.oldestXid = ShmemVariableCache->oldestXid;
                               6634            2335 :     checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
                               6635            2335 :     LWLockRelease(XidGenLock);
                               6636                 : 
 3049 alvherre                 6637            2335 :     LWLockAcquire(CommitTsLock, LW_SHARED);
 2659 mail                     6638            2335 :     checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
                               6639            2335 :     checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
 3049 alvherre                 6640 CBC        2335 :     LWLockRelease(CommitTsLock);
 3049 alvherre                 6641 ECB             : 
 3292 heikki.linnakangas       6642 GIC        2335 :     LWLockAcquire(OidGenLock, LW_SHARED);
                               6643            2335 :     checkPoint.nextOid = ShmemVariableCache->nextOid;
                               6644            2335 :     if (!shutdown)
                               6645            1368 :         checkPoint.nextOid += ShmemVariableCache->oidCount;
                               6646            2335 :     LWLockRelease(OidGenLock);
                               6647                 : 
                               6648            2335 :     MultiXactGetCheckptMulti(shutdown,
 3292 heikki.linnakangas       6649 ECB             :                              &checkPoint.nextMulti,
                               6650                 :                              &checkPoint.nextMultiOffset,
                               6651                 :                              &checkPoint.oldestMulti,
                               6652                 :                              &checkPoint.oldestMultiDB);
                               6653                 : 
                               6654                 :     /*
                               6655                 :      * Having constructed the checkpoint record, ensure all shmem disk buffers
                               6656                 :      * and commit-log buffers are flushed to disk.
                               6657                 :      *
                               6658                 :      * This I/O could fail for various reasons.  If so, we will fail to
                               6659                 :      * complete the checkpoint, but there is no reason to force a system
                               6660                 :      * panic. Accordingly, exit critical section while doing it.
                               6661                 :      */
 3292 heikki.linnakangas       6662 GIC        2335 :     END_CRIT_SECTION();
                               6663                 : 
                               6664                 :     /*
 3602 bruce                    6665 ECB             :      * In some cases there are groups of actions that must all occur on one
                               6666                 :      * side or the other of a checkpoint record. Before flushing the
                               6667                 :      * checkpoint record we must explicitly wait for any backend currently
                               6668                 :      * performing those groups of actions.
                               6669                 :      *
                               6670                 :      * One example is end of transaction, so we must wait for any transactions
                               6671                 :      * that are currently in commit critical sections.  If an xact inserted
                               6672                 :      * its commit record into XLOG just before the REDO point, then a crash
 5850 tgl                      6673                 :      * restart from the REDO point would not replay that record, which means
 2214 rhaas                    6674                 :      * that our flushing had better include the xact's update of pg_xact.  So
                               6675                 :      * we wait till he's out of his commit critical section before proceeding.
 5850 tgl                      6676                 :      * See notes in RecordTransactionCommit().
                               6677                 :      *
                               6678                 :      * Because we've already released the insertion locks, this test is a bit
                               6679                 :      * fuzzy: it is possible that we will wait for xacts we didn't really need
                               6680                 :      * to wait for.  But the delay should be short and it seems better to make
                               6681                 :      * checkpoint take a bit longer than to hold off insertions longer than
 3260 bruce                    6682                 :      * necessary. (In fact, the whole reason we have this issue is that xact.c
                               6683                 :      * does commit record XLOG insertion and clog update as two separate steps
                               6684                 :      * protected by different locks, but again that seems best on grounds of
                               6685                 :      * minimizing lock contention.)
                               6686                 :      *
                               6687                 :      * A transaction that has not yet set delayChkptFlags when we look cannot
  366 rhaas                    6688                 :      * be at risk, since it has not inserted its commit record yet; and one
                               6689                 :      * that's already cleared it is not at risk either, since it's done fixing
                               6690                 :      * clog and we will correctly flush the update below.  So we cannot miss
                               6691                 :      * any xacts we need to wait for.
                               6692                 :      */
  381 rhaas                    6693 GIC        2335 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
 3779 simon                    6694            2335 :     if (nvxids > 0)
                               6695                 :     {
 5624 bruce                    6696 ECB             :         do
                               6697                 :         {
 5624 bruce                    6698 GIC           9 :             pg_usleep(10000L);  /* wait for 10 msec */
  381 rhaas                    6699 CBC           9 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
                               6700                 :                                               DELAY_CHKPT_START));
 5850 tgl                      6701 ECB             :     }
 3779 simon                    6702 CBC        2335 :     pfree(vxids);
 5850 tgl                      6703 ECB             : 
 5764 tgl                      6704 GIC        2335 :     CheckPointGuts(checkPoint.redo, flags);
 7897 tgl                      6705 ECB             : 
  381 rhaas                    6706 GIC        2335 :     vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
                               6707            2335 :     if (nvxids > 0)
                               6708                 :     {
                               6709                 :         do
                               6710                 :         {
  381 rhaas                    6711 UIC           0 :             pg_usleep(10000L);  /* wait for 10 msec */
                               6712               0 :         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
                               6713                 :                                               DELAY_CHKPT_COMPLETE));
  381 rhaas                    6714 ECB             :     }
  381 rhaas                    6715 CBC        2335 :     pfree(vxids);
                               6716                 : 
 4859 simon                    6717 ECB             :     /*
 4790 bruce                    6718                 :      * Take a snapshot of running transactions and write this to WAL. This
                               6719                 :      * allows us to reconstruct the state of running transactions during
                               6720                 :      * archive recovery, if required. Skip, if this info disabled.
 4859 simon                    6721                 :      *
                               6722                 :      * If we are shutting down, or Startup process is completing crash
                               6723                 :      * recovery we don't need to write running xact data.
                               6724                 :      */
 4859 simon                    6725 GIC        2335 :     if (!shutdown && XLogStandbyInfoActive())
 3780 tgl                      6726            1348 :         LogStandbySnapshot();
                               6727                 : 
 7274                          6728            2335 :     START_CRIT_SECTION();
                               6729                 : 
                               6730                 :     /*
                               6731                 :      * Now insert the checkpoint record into XLOG.
                               6732                 :      */
 3062 heikki.linnakangas       6733 CBC        2335 :     XLogBeginInsert();
                               6734            2335 :     XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
 8062 tgl                      6735 GIC        2335 :     recptr = XLogInsert(RM_XLOG_ID,
 8062 tgl                      6736 EUB             :                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
 3062 heikki.linnakangas       6737                 :                         XLOG_CHECKPOINT_ONLINE);
                               6738                 : 
 8062 tgl                      6739 GBC        2335 :     XLogFlush(recptr);
                               6740                 : 
 5035 tgl                      6741 ECB             :     /*
                               6742                 :      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
                               6743                 :      * overwritten at next startup.  No-one should even try, this just allows
                               6744                 :      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
                               6745                 :      * to just temporarily disable writing until the system has exited
                               6746                 :      * recovery.
                               6747                 :      */
 5035 tgl                      6748 GIC        2335 :     if (shutdown)
                               6749                 :     {
                               6750             967 :         if (flags & CHECKPOINT_END_OF_RECOVERY)
  531 rhaas                    6751              27 :             LocalXLogInsertAllowed = oldXLogAllowed;
                               6752                 :         else
 4790 bruce                    6753             940 :             LocalXLogInsertAllowed = 0; /* never again write WAL */
 5035 tgl                      6754 ECB             :     }
                               6755                 : 
                               6756                 :     /*
                               6757                 :      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
                               6758                 :      * = end of actual checkpoint record.
                               6759                 :      */
 3754 alvherre                 6760 CBC        2335 :     if (shutdown && checkPoint.redo != ProcLastRecPtr)
 7202 tgl                      6761 UIC           0 :         ereport(PANIC,
                               6762                 :                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
 8595 vadim4o                  6763 ECB             : 
 8062 tgl                      6764                 :     /*
 1957 rhaas                    6765                 :      * Remember the prior checkpoint's redo ptr for
                               6766                 :      * UpdateCheckPointDistanceEstimate()
                               6767                 :      */
 2967 heikki.linnakangas       6768 GIC        2335 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
                               6769                 : 
                               6770                 :     /*
 8062 tgl                      6771 ECB             :      * Update the control file.
                               6772                 :      */
 7862 tgl                      6773 GIC        2335 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 8595 vadim4o                  6774            2335 :     if (shutdown)
 8595 vadim4o                  6775 CBC         967 :         ControlFile->state = DB_SHUTDOWNED;
 8062 tgl                      6776 GIC        2335 :     ControlFile->checkPoint = ProcLastRecPtr;
                               6777            2335 :     ControlFile->checkPointCopy = checkPoint;
                               6778                 :     /* crash recovery should always recover to the end of WAL */
 3755 alvherre                 6779            2335 :     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 3778 heikki.linnakangas       6780            2335 :     ControlFile->minRecoveryPointTLI = 0;
                               6781                 : 
                               6782                 :     /*
                               6783                 :      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
                               6784                 :      * unused on non-shutdown checkpoints, but seems useful to store it always
                               6785                 :      * for debugging purposes.
                               6786                 :      */
 3709 heikki.linnakangas       6787 CBC        2335 :     SpinLockAcquire(&XLogCtl->ulsn_lck);
                               6788            2335 :     ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
                               6789            2335 :     SpinLockRelease(&XLogCtl->ulsn_lck);
 3709 heikki.linnakangas       6790 ECB             : 
 8595 vadim4o                  6791 CBC        2335 :     UpdateControlFile();
 7862 tgl                      6792 GIC        2335 :     LWLockRelease(ControlFileLock);
 8595 vadim4o                  6793 ECB             : 
 6075 tgl                      6794                 :     /* Update shared-memory copy of checkpoint XID/epoch */
 3121 andres                   6795 CBC        2335 :     SpinLockAcquire(&XLogCtl->info_lck);
  971                          6796            2335 :     XLogCtl->ckptFullXid = checkPoint.nextXid;
 3121 andres                   6797 GIC        2335 :     SpinLockRelease(&XLogCtl->info_lck);
 6075 tgl                      6798 ECB             : 
 7274                          6799                 :     /*
 6385 bruce                    6800                 :      * We are now done with critical updates; no need for system panic if we
 5764 tgl                      6801                 :      * have trouble while fooling with old log segments.
 7274                          6802                 :      */
 7274 tgl                      6803 GIC        2335 :     END_CRIT_SECTION();
 7274 tgl                      6804 ECB             : 
                               6805                 :     /*
                               6806                 :      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
                               6807                 :      */
 1466 tmunro                   6808 GIC        2335 :     SyncPostCheckpoint();
                               6809                 : 
                               6810                 :     /*
                               6811                 :      * Update the average distance between checkpoints if the prior checkpoint
                               6812                 :      * exists.
                               6813                 :      */
 2967 heikki.linnakangas       6814            2335 :     if (PriorRedoPtr != InvalidXLogRecPtr)
                               6815            2335 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
                               6816                 : 
                               6817                 :     /*
 1720 michael                  6818 ECB             :      * Delete old log files, those no longer needed for last checkpoint to
                               6819                 :      * prevent the disk holding the xlog from growing full.
                               6820                 :      */
 1720 michael                  6821 GIC        2335 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
                               6822            2335 :     KeepLogSeg(recptr, &_logSegNo);
    2 andres                   6823 GNC        2335 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
                               6824                 :                                            _logSegNo, InvalidOid,
                               6825                 :                                            InvalidTransactionId))
                               6826                 :     {
                               6827                 :         /*
                               6828                 :          * Some slots have been invalidated; recalculate the old-segment
                               6829                 :          * horizon, starting again from RedoRecPtr.
                               6830                 :          */
  632 alvherre                 6831 GIC           3 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
                               6832               3 :         KeepLogSeg(recptr, &_logSegNo);
                               6833                 :     }
 1720 michael                  6834            2335 :     _logSegNo--;
  520 rhaas                    6835            2335 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
                               6836                 :                        checkPoint.ThisTimeLineID);
                               6837                 : 
                               6838                 :     /*
                               6839                 :      * Make more log segments if needed.  (Do this after recycling old log
                               6840                 :      * segments, since that may supply some of the needed files.)
                               6841                 :      */
 8062 tgl                      6842            2335 :     if (!shutdown)
  520 rhaas                    6843            1368 :         PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
                               6844                 : 
                               6845                 :     /*
                               6846                 :      * Truncate pg_subtrans if possible.  We can throw away all data before
                               6847                 :      * the oldest XMIN of any running transaction.  No future transaction will
                               6848                 :      * attempt to reference any pg_subtrans entry older than that (see Asserts
                               6849                 :      * in subtrans.c).  During recovery, though, we mustn't do this because
                               6850                 :      * StartupSUBTRANS hasn't been called yet.
 6803 tgl                      6851 ECB             :      */
 5035 tgl                      6852 CBC        2335 :     if (!RecoveryInProgress())
  970 andres                   6853 GIC        2308 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
                               6854                 : 
                               6855                 :     /* Real work is done; log and update stats. */
 4021 rhaas                    6856 CBC        2335 :     LogCheckpointEnd(false);
 6736 tgl                      6857 ECB             : 
                               6858                 :     /* Reset the process title */
  846 michael                  6859 GIC        2335 :     update_checkpoint_display(flags, false, true);
  846 michael                  6860 ECB             : 
                               6861                 :     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
 5142 tgl                      6862                 :                                      NBuffers,
                               6863                 :                                      CheckpointStats.ckpt_segs_added,
                               6864                 :                                      CheckpointStats.ckpt_segs_removed,
                               6865                 :                                      CheckpointStats.ckpt_segs_recycled);
                               6866                 : }
                               6867                 : 
                               6868                 : /*
 3722 simon                    6869 EUB             :  * Mark the end of recovery in WAL though without running a full checkpoint.
                               6870                 :  * We can expect that a restartpoint is likely to be in progress as we
                               6871                 :  * do this, though we are unwilling to wait for it to complete.
                               6872                 :  *
 3722 simon                    6873 ECB             :  * CreateRestartPoint() allows for the case where recovery may end before
                               6874                 :  * the restartpoint completes so there is no concern of concurrent behaviour.
                               6875                 :  */
                               6876                 : static void
 3722 simon                    6877 GIC          36 : CreateEndOfRecoveryRecord(void)
                               6878                 : {
                               6879                 :     xl_end_of_recovery xlrec;
                               6880                 :     XLogRecPtr  recptr;
                               6881                 : 
                               6882                 :     /* sanity check */
 3722 simon                    6883 CBC          36 :     if (!RecoveryInProgress())
 3722 simon                    6884 LBC           0 :         elog(ERROR, "can only be used to end recovery");
                               6885                 : 
 3033 heikki.linnakangas       6886 CBC          36 :     xlrec.end_time = GetCurrentTimestamp();
                               6887                 : 
 3306 heikki.linnakangas       6888 GIC          36 :     WALInsertLockAcquireExclusive();
  515 rhaas                    6889              36 :     xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
 3709 heikki.linnakangas       6890              36 :     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
 3306 heikki.linnakangas       6891 CBC          36 :     WALInsertLockRelease();
 3722 simon                    6892 ECB             : 
 3722 simon                    6893 CBC          36 :     START_CRIT_SECTION();
                               6894                 : 
 3062 heikki.linnakangas       6895 GIC          36 :     XLogBeginInsert();
                               6896              36 :     XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
 3062 heikki.linnakangas       6897 CBC          36 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
                               6898                 : 
 3720 simon                    6899 GIC          36 :     XLogFlush(recptr);
                               6900                 : 
                               6901                 :     /*
                               6902                 :      * Update the control file so that crash recovery can follow the timeline
                               6903                 :      * changes to this point.
                               6904                 :      */
                               6905              36 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 3720 simon                    6906 CBC          36 :     ControlFile->minRecoveryPoint = recptr;
  520 rhaas                    6907 GIC          36 :     ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
 3720 simon                    6908 CBC          36 :     UpdateControlFile();
                               6909              36 :     LWLockRelease(ControlFileLock);
                               6910                 : 
 3722                          6911              36 :     END_CRIT_SECTION();
 3722 simon                    6912 GIC          36 : }
                               6913                 : 
                               6914                 : /*
                               6915                 :  * Write an OVERWRITE_CONTRECORD message.
                               6916                 :  *
                               6917                 :  * When on WAL replay we expect a continuation record at the start of a page
  557 alvherre                 6918 ECB             :  * that is not there, recovery ends and WAL writing resumes at that point.
  557 alvherre                 6919 EUB             :  * But it's wrong to resume writing new WAL back at the start of the record
                               6920                 :  * that was broken, because downstream consumers of that WAL (physical
                               6921                 :  * replicas) are not prepared to "rewind".  So the first action after
                               6922                 :  * finishing replay of all valid WAL must be to write a record of this type
                               6923                 :  * at the point where the contrecord was missing; to support xlogreader
                               6924                 :  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
                               6925                 :  * to the page header where the record occurs.  xlogreader has an ad-hoc
  557 alvherre                 6926 ECB             :  * mechanism to report metadata about the broken record, which is what we
                               6927                 :  * use here.
                               6928                 :  *
                               6929                 :  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
                               6930                 :  * skip the record it was reading, and pass back the LSN of the skipped
                               6931                 :  * record, so that its caller can verify (on "replay" of that record) that the
                               6932                 :  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
  417 heikki.linnakangas       6933                 :  *
                               6934                 :  * 'aborted_lsn' is the beginning position of the record that was incomplete.
                               6935                 :  * It is included in the WAL record.  'pagePtr' and 'newTLI' point to the
                               6936                 :  * beginning of the XLOG page where the record is to be inserted.  They must
                               6937                 :  * match the current WAL insert position, they're passed here just so that we
                               6938                 :  * can verify that.
                               6939                 :  */
                               6940                 : static XLogRecPtr
  417 heikki.linnakangas       6941 GIC           1 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
                               6942                 :                                 TimeLineID newTLI)
                               6943                 : {
                               6944                 :     xl_overwrite_contrecord xlrec;
  557 alvherre                 6945 ECB             :     XLogRecPtr  recptr;
  417 heikki.linnakangas       6946                 :     XLogPageHeader pagehdr;
                               6947                 :     XLogRecPtr  startPos;
                               6948                 : 
                               6949                 :     /* sanity checks */
  557 alvherre                 6950 CBC           1 :     if (!RecoveryInProgress())
  557 alvherre                 6951 UIC           0 :         elog(ERROR, "can only be used at end of recovery");
  417 heikki.linnakangas       6952 GIC           1 :     if (pagePtr % XLOG_BLCKSZ != 0)
  417 heikki.linnakangas       6953 LBC           0 :         elog(ERROR, "invalid position for missing continuation record %X/%X",
  417 heikki.linnakangas       6954 ECB             :              LSN_FORMAT_ARGS(pagePtr));
                               6955                 : 
                               6956                 :     /* The current WAL insert position should be right after the page header */
  417 heikki.linnakangas       6957 GIC           1 :     startPos = pagePtr;
                               6958               1 :     if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
                               6959               1 :         startPos += SizeOfXLogLongPHD;
                               6960                 :     else
  417 heikki.linnakangas       6961 LBC           0 :         startPos += SizeOfXLogShortPHD;
  417 heikki.linnakangas       6962 GIC           1 :     recptr = GetXLogInsertRecPtr();
                               6963               1 :     if (recptr != startPos)
  417 heikki.linnakangas       6964 UIC           0 :         elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
                               6965                 :              LSN_FORMAT_ARGS(recptr));
  557 alvherre                 6966 ECB             : 
  557 alvherre                 6967 GIC           1 :     START_CRIT_SECTION();
                               6968                 : 
                               6969                 :     /*
                               6970                 :      * Initialize the XLOG page header (by GetXLogBuffer), and set the
                               6971                 :      * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
  417 heikki.linnakangas       6972 ECB             :      *
                               6973                 :      * No other backend is allowed to write WAL yet, so acquiring the WAL
                               6974                 :      * insertion lock is just pro forma.
                               6975                 :      */
  417 heikki.linnakangas       6976 GIC           1 :     WALInsertLockAcquire();
                               6977               1 :     pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
                               6978               1 :     pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
  417 heikki.linnakangas       6979 CBC           1 :     WALInsertLockRelease();
  417 heikki.linnakangas       6980 ECB             : 
                               6981                 :     /*
                               6982                 :      * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
                               6983                 :      * page.  We know it becomes the first record, because no other backend is
                               6984                 :      * allowed to write WAL yet.
                               6985                 :      */
  557 alvherre                 6986 GIC           1 :     XLogBeginInsert();
  417 heikki.linnakangas       6987               1 :     xlrec.overwritten_lsn = aborted_lsn;
                               6988               1 :     xlrec.overwrite_time = GetCurrentTimestamp();
  557 alvherre                 6989 CBC           1 :     XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
                               6990               1 :     recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
                               6991                 : 
  417 heikki.linnakangas       6992 ECB             :     /* check that the record was inserted to the right place */
  417 heikki.linnakangas       6993 CBC           1 :     if (ProcLastRecPtr != startPos)
  417 heikki.linnakangas       6994 UIC           0 :         elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
                               6995                 :              LSN_FORMAT_ARGS(ProcLastRecPtr));
                               6996                 : 
  557 alvherre                 6997 GIC           1 :     XLogFlush(recptr);
                               6998                 : 
                               6999               1 :     END_CRIT_SECTION();
  557 alvherre                 7000 ECB             : 
  557 alvherre                 7001 CBC           1 :     return recptr;
                               7002                 : }
                               7003                 : 
                               7004                 : /*
                               7005                 :  * Flush all data in shared memory to disk, and fsync
                               7006                 :  *
                               7007                 :  * This is the common code shared between regular checkpoints and
                               7008                 :  * recovery restartpoints.
                               7009                 :  */
 6089 tgl                      7010 ECB             : static void
 5764 tgl                      7011 CBC        2363 : CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
                               7012                 : {
 4809 tgl                      7013 GIC        2363 :     CheckPointRelationMap();
 3355 rhaas                    7014 CBC        2363 :     CheckPointReplicationSlots();
 3324 rhaas                    7015 GIC        2363 :     CheckPointSnapBuild();
                               7016            2363 :     CheckPointLogicalRewriteHeap();
 2902 andres                   7017 CBC        2363 :     CheckPointReplicationOrigin();
                               7018                 : 
                               7019                 :     /* Write out all dirty data in SLRUs and the main buffer pool */
                               7020                 :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
  926 tmunro                   7021 GIC        2363 :     CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
                               7022            2363 :     CheckPointCLOG();
                               7023            2363 :     CheckPointCommitTs();
                               7024            2363 :     CheckPointSUBTRANS();
                               7025            2363 :     CheckPointMultiXact();
                               7026            2363 :     CheckPointPredicate();
                               7027            2363 :     CheckPointBuffers(flags);
                               7028                 : 
                               7029                 :     /* Perform all queued up fsyncs */
                               7030                 :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
                               7031            2363 :     CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
                               7032            2363 :     ProcessSyncRequests();
                               7033            2363 :     CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
                               7034                 :     TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
  926 tmunro                   7035 ECB             : 
                               7036                 :     /* We deliberately delay 2PC checkpointing as long as possible */
 6089 tgl                      7037 GIC        2363 :     CheckPointTwoPhase(checkPointRedo);
                               7038            2363 : }
                               7039                 : 
                               7040                 : /*
 5035 tgl                      7041 ECB             :  * Save a checkpoint for recovery restart if appropriate
 5035 tgl                      7042 EUB             :  *
                               7043                 :  * This function is called each time a checkpoint record is read from XLOG.
 5035 tgl                      7044 ECB             :  * It must determine whether the checkpoint represents a safe restartpoint or
                               7045                 :  * not.  If so, the checkpoint record is stashed in shared memory so that
                               7046                 :  * CreateRestartPoint can consult it.  (Note that the latter function is
 4176 simon                    7047                 :  * executed by the checkpointer, while this one will be executed by the
                               7048                 :  * startup process.)
 6089 tgl                      7049                 :  */
                               7050                 : static void
  501 rhaas                    7051 CBC         166 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
                               7052                 : {
 4146 heikki.linnakangas       7053 ECB             :     /*
 3955 bruce                    7054                 :      * Also refrain from creating a restartpoint if we have seen any
                               7055                 :      * references to non-existent pages. Restarting recovery from the
                               7056                 :      * restartpoint would not see the references, so we would lose the
                               7057                 :      * cross-check that the pages belonged to a relation that was dropped
                               7058                 :      * later.
                               7059                 :      */
 4146 heikki.linnakangas       7060 GIC         166 :     if (XLogHaveInvalidPages())
                               7061                 :     {
 4146 heikki.linnakangas       7062 UIC           0 :         elog(trace_recovery(DEBUG2),
 4146 heikki.linnakangas       7063 ECB             :              "could not record restart point at %X/%X because there "
                               7064                 :              "are unresolved references to invalid pages",
  775 peter                    7065                 :              LSN_FORMAT_ARGS(checkPoint->redo));
 4146 heikki.linnakangas       7066 LBC           0 :         return;
 4146 heikki.linnakangas       7067 ECB             :     }
                               7068                 : 
 6089 tgl                      7069                 :     /*
 3955 bruce                    7070                 :      * Copy the checkpoint record to shared memory, so that checkpointer can
                               7071                 :      * work out the next time it wants to perform a restartpoint.
                               7072                 :      */
 3121 andres                   7073 GIC         166 :     SpinLockAcquire(&XLogCtl->info_lck);
  501 rhaas                    7074             166 :     XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
                               7075             166 :     XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
 3121 andres                   7076             166 :     XLogCtl->lastCheckPoint = *checkPoint;
                               7077             166 :     SpinLockRelease(&XLogCtl->info_lck);
                               7078                 : }
                               7079                 : 
                               7080                 : /*
                               7081                 :  * Establish a restartpoint if possible.
                               7082                 :  *
                               7083                 :  * This is similar to CreateCheckPoint, but is used during WAL recovery
                               7084                 :  * to establish a point from which recovery can roll forward without
                               7085                 :  * replaying the entire recovery log.
                               7086                 :  *
                               7087                 :  * Returns true if a new restartpoint was established. We can only establish
                               7088                 :  * a restartpoint if we have replayed a safe checkpoint record since the last
                               7089                 :  * restartpoint.
                               7090                 :  */
                               7091                 : bool
 5163 heikki.linnakangas       7092              71 : CreateRestartPoint(int flags)
                               7093                 : {
                               7094                 :     XLogRecPtr  lastCheckPointRecPtr;
                               7095                 :     XLogRecPtr  lastCheckPointEndPtr;
                               7096                 :     CheckPoint  lastCheckPoint;
                               7097                 :     XLogRecPtr  PriorRedoPtr;
                               7098                 :     XLogRecPtr  receivePtr;
 1720 michael                  7099 ECB             :     XLogRecPtr  replayPtr;
                               7100                 :     TimeLineID  replayTLI;
                               7101                 :     XLogRecPtr  endptr;
                               7102                 :     XLogSegNo   _logSegNo;
                               7103                 :     TimestampTz xtime;
                               7104                 : 
                               7105                 :     /* Concurrent checkpoint/restartpoint cannot happen */
  335 michael                  7106 GIC          71 :     Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
                               7107                 : 
 5035 tgl                      7108 ECB             :     /* Get a local copy of the last safe checkpoint record. */
 3121 andres                   7109 GBC          71 :     SpinLockAcquire(&XLogCtl->info_lck);
 3121 andres                   7110 CBC          71 :     lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
 2355 rhaas                    7111 GBC          71 :     lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
 3121 andres                   7112 GIC          71 :     lastCheckPoint = XLogCtl->lastCheckPoint;
                               7113              71 :     SpinLockRelease(&XLogCtl->info_lck);
                               7114                 : 
 5050 bruce                    7115 ECB             :     /*
 5163 heikki.linnakangas       7116                 :      * Check that we're still in recovery mode. It's ok if we exit recovery
                               7117                 :      * mode after this check; the restart point is valid anyway.
                               7118                 :      */
 5163 heikki.linnakangas       7119 GBC          71 :     if (!RecoveryInProgress())
 5163 heikki.linnakangas       7120 ECB             :     {
 5163 heikki.linnakangas       7121 LBC           0 :         ereport(DEBUG2,
  781 peter                    7122 EUB             :                 (errmsg_internal("skipping restartpoint, recovery has already ended")));
 5163 heikki.linnakangas       7123 UIC           0 :         return false;
                               7124                 :     }
 5163 heikki.linnakangas       7125 ECB             : 
                               7126                 :     /*
                               7127                 :      * If the last checkpoint record we've replayed is already our last
                               7128                 :      * restartpoint, we can't perform a new restart point. We still update
                               7129                 :      * minRecoveryPoint in that case, so that if this is a shutdown restart
                               7130                 :      * point, we won't start up earlier than before. That's not strictly
                               7131                 :      * necessary, but when hot standby is enabled, it would be rather weird if
                               7132                 :      * the database opened up for read-only connections at a point-in-time
                               7133                 :      * before the last shutdown. Such time travel is still possible in case of
 4660 bruce                    7134                 :      * immediate shutdown, though.
 5163 heikki.linnakangas       7135                 :      *
                               7136                 :      * We don't explicitly advance minRecoveryPoint when we do create a
 5050 bruce                    7137                 :      * restartpoint. It's assumed that flushing the buffers will do that as a
                               7138                 :      * side-effect.
                               7139                 :      */
 5163 heikki.linnakangas       7140 GIC          71 :     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
 3754 alvherre                 7141              69 :         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
                               7142                 :     {
 5163 heikki.linnakangas       7143              43 :         ereport(DEBUG2,
  781 peter                    7144 ECB             :                 (errmsg_internal("skipping restartpoint, already performed at %X/%X",
  775                          7145                 :                                  LSN_FORMAT_ARGS(lastCheckPoint.redo))));
 5163 heikki.linnakangas       7146                 : 
 5163 heikki.linnakangas       7147 CBC          43 :         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
 4693 rhaas                    7148              43 :         if (flags & CHECKPOINT_IS_SHUTDOWN)
                               7149                 :         {
 4693 rhaas                    7150 GIC          18 :             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 4693 rhaas                    7151 CBC          18 :             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
 4693 rhaas                    7152 GBC          18 :             UpdateControlFile();
 4693 rhaas                    7153 GIC          18 :             LWLockRelease(ControlFileLock);
                               7154                 :         }
 5163 heikki.linnakangas       7155 CBC          43 :         return false;
                               7156                 :     }
 5163 heikki.linnakangas       7157 ECB             : 
                               7158                 :     /*
 4660 bruce                    7159                 :      * Update the shared RedoRecPtr so that the startup process can calculate
                               7160                 :      * the number of segments replayed since last restartpoint, and request a
                               7161                 :      * restartpoint if it exceeds CheckPointSegments.
                               7162                 :      *
                               7163                 :      * Like in CreateCheckPoint(), hold off insertions to update it, although
                               7164                 :      * during recovery this is just pro forma, because no WAL insertions are
                               7165                 :      * happening.
                               7166                 :      */
 3306 heikki.linnakangas       7167 GIC          28 :     WALInsertLockAcquireExclusive();
 2967                          7168              28 :     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
 3306 heikki.linnakangas       7169 CBC          28 :     WALInsertLockRelease();
                               7170                 : 
 3562 heikki.linnakangas       7171 ECB             :     /* Also update the info_lck-protected copy */
 3121 andres                   7172 CBC          28 :     SpinLockAcquire(&XLogCtl->info_lck);
                               7173              28 :     XLogCtl->RedoRecPtr = lastCheckPoint.redo;
                               7174              28 :     SpinLockRelease(&XLogCtl->info_lck);
 4687 heikki.linnakangas       7175 ECB             : 
                               7176                 :     /*
                               7177                 :      * Prepare to accumulate statistics.
                               7178                 :      *
 4449 rhaas                    7179                 :      * Note: because it is possible for log_checkpoints to change while a
                               7180                 :      * checkpoint proceeds, we always accumulate stats, even if
                               7181                 :      * log_checkpoints is currently off.
                               7182                 :      */
 4449 rhaas                    7183 CBC         308 :     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
                               7184              28 :     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
 5163 heikki.linnakangas       7185 ECB             : 
 4449 rhaas                    7186 GIC          28 :     if (log_checkpoints)
 5163 heikki.linnakangas       7187              28 :         LogCheckpointStart(flags, true);
                               7188                 : 
  846 michael                  7189 ECB             :     /* Update the process title */
  846 michael                  7190 CBC          28 :     update_checkpoint_display(flags, true, false);
  846 michael                  7191 ECB             : 
 5163 heikki.linnakangas       7192 GIC          28 :     CheckPointGuts(lastCheckPoint.redo, flags);
                               7193                 : 
                               7194                 :     /*
 1957 rhaas                    7195 ECB             :      * Remember the prior checkpoint's redo ptr for
                               7196                 :      * UpdateCheckPointDistanceEstimate()
                               7197                 :      */
 2967 heikki.linnakangas       7198 GIC          28 :     PriorRedoPtr = ControlFile->checkPointCopy.redo;
                               7199                 : 
                               7200                 :     /*
                               7201                 :      * Update pg_control, using current time.  Check that it still shows an
                               7202                 :      * older checkpoint, else do nothing; this is a quick hack to make sure
                               7203                 :      * nothing really bad happens if somehow we get here after the
                               7204                 :      * end-of-recovery checkpoint.
                               7205                 :      */
 5163                          7206              28 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  335 michael                  7207              28 :     if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
                               7208                 :     {
  335 michael                  7209 ECB             :         /*
                               7210                 :          * Update the checkpoint information.  We do this even if the cluster
                               7211                 :          * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
                               7212                 :          * segments recycled below.
                               7213                 :          */
 5035 tgl                      7214 GIC          28 :         ControlFile->checkPoint = lastCheckPointRecPtr;
                               7215              28 :         ControlFile->checkPointCopy = lastCheckPoint;
                               7216                 : 
                               7217                 :         /*
  335 michael                  7218 ECB             :          * Ensure minRecoveryPoint is past the checkpoint record and update it
                               7219                 :          * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
 2355 rhaas                    7220 EUB             :          * this will have happened already while writing out dirty buffers,
                               7221                 :          * but not necessarily - e.g. because no buffers were dirtied.  We do
                               7222                 :          * this because a backup performed in recovery uses minRecoveryPoint
                               7223                 :          * to determine which WAL files must be included in the backup, and
  335 michael                  7224                 :          * the file (or files) containing the checkpoint record must be
                               7225                 :          * included, at a minimum.  Note that for an ordinary restart of
                               7226                 :          * recovery there's no value in having the minimum recovery point any
                               7227                 :          * earlier than this anyway, because redo will begin just after the
                               7228                 :          * checkpoint record.
                               7229                 :          */
  335 michael                  7230 GIC          28 :         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
 2355 rhaas                    7231 ECB             :         {
  335 michael                  7232 CBC          28 :             if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
  335 michael                  7233 ECB             :             {
  335 michael                  7234 CBC           6 :                 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
                               7235               6 :                 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
                               7236                 : 
                               7237                 :                 /* update local copy */
  335 michael                  7238 GIC           6 :                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
                               7239               6 :                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
                               7240                 :             }
                               7241              28 :             if (flags & CHECKPOINT_IS_SHUTDOWN)
                               7242              13 :                 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
                               7243                 :         }
 5035 tgl                      7244              28 :         UpdateControlFile();
                               7245                 :     }
 5163 heikki.linnakangas       7246              28 :     LWLockRelease(ControlFileLock);
                               7247                 : 
                               7248                 :     /*
                               7249                 :      * Update the average distance between checkpoints/restartpoints if the
 1720 michael                  7250 ECB             :      * prior checkpoint exists.
                               7251                 :      */
 2967 heikki.linnakangas       7252 GIC          28 :     if (PriorRedoPtr != InvalidXLogRecPtr)
                               7253              28 :         UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
                               7254                 : 
                               7255                 :     /*
                               7256                 :      * Delete old log files that are no longer needed for the last
                               7257                 :      * restartpoint, to prevent the disk holding the xlog from filling up.
                               7258                 :      */
 1720 michael                  7259              28 :     XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
                               7260                 : 
                               7261                 :     /*
                               7262                 :      * Retreat _logSegNo using the current end of xlog replayed or received,
                               7263                 :      * whichever is later.
 1720 michael                  7264 ECB             :      */
 1096 tmunro                   7265 GIC          28 :     receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
 1720 michael                  7266              28 :     replayPtr = GetXLogReplayRecPtr(&replayTLI);
 1720 michael                  7267 CBC          28 :     endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
                               7268              28 :     KeepLogSeg(endptr, &_logSegNo);
    2 andres                   7269 GNC          28 :     if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
                               7270                 :                                            _logSegNo, InvalidOid,
                               7271                 :                                            InvalidTransactionId))
  632 alvherre                 7272 ECB             :     {
                               7273                 :         /*
                               7274                 :          * Some slots have been invalidated; recalculate the old-segment
                               7275                 :          * horizon, starting again from RedoRecPtr.
                               7276                 :          */
  632 alvherre                 7277 UIC           0 :         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
                               7278               0 :         KeepLogSeg(endptr, &_logSegNo);
  632 alvherre                 7279 ECB             :     }
 1720 michael                  7280 GIC          28 :     _logSegNo--;
 3762 heikki.linnakangas       7281 EUB             : 
                               7282                 :     /*
 1720 michael                  7283                 :      * Try to recycle segments on a useful timeline. If we've been promoted
                               7284                 :      * since the beginning of this restartpoint, use the new timeline chosen
                               7285                 :      * at end of recovery.  If we're still in recovery, use the timeline we're
                               7286                 :      * currently replaying.
                               7287                 :      *
                               7288                 :      * There is no guarantee that the WAL segments will be useful on the
                               7289                 :      * current timeline; if recovery proceeds to a new timeline right after
                               7290                 :      * this, the pre-allocated WAL segments on this timeline will not be used,
                               7291                 :      * and will go wasted until recycled on the next restartpoint. We'll live
                               7292                 :      * with that.
                               7293                 :      */
  520 rhaas                    7294 GIC          28 :     if (!RecoveryInProgress())
  515 rhaas                    7295 UIC           0 :         replayTLI = XLogCtl->InsertTimeLineID;
                               7296                 : 
  520 rhaas                    7297 GIC          28 :     RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
                               7298                 : 
                               7299                 :     /*
 1720 michael                  7300 ECB             :      * Make more log segments if needed.  (Do this after recycling old log
                               7301                 :      * segments, since that may supply some of the needed files.)
                               7302                 :      */
  520 rhaas                    7303 CBC          28 :     PreallocXlogFiles(endptr, replayTLI);
                               7304                 : 
                               7305                 :     /*
                               7306                 :      * Truncate pg_subtrans if possible.  We can throw away all data before
 3260 bruce                    7307 ECB             :      * the oldest XMIN of any running transaction.  No future transaction will
 4605 simon                    7308                 :      * attempt to reference any pg_subtrans entry older than that (see Asserts
                               7309                 :      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
                               7310                 :      * this because StartupSUBTRANS hasn't been called yet.
 5163 heikki.linnakangas       7311                 :      */
 4605 simon                    7312 CBC          28 :     if (EnableHotStandby)
  970 andres                   7313              28 :         TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
                               7314                 : 
  618 heikki.linnakangas       7315 ECB             :     /* Real work is done; log and update stats. */
 4021 rhaas                    7316 GIC          28 :     LogCheckpointEnd(true);
                               7317                 : 
                               7318                 :     /* Reset the process title */
  846 michael                  7319              28 :     update_checkpoint_display(flags, true, true);
                               7320                 : 
 4663 tgl                      7321              28 :     xtime = GetLatestXTime();
 5163 heikki.linnakangas       7322              28 :     ereport((log_checkpoints ? LOG : DEBUG2),
                               7323                 :             (errmsg("recovery restart point at %X/%X",
                               7324                 :                     LSN_FORMAT_ARGS(lastCheckPoint.redo)),
                               7325                 :              xtime ? errdetail("Last completed transaction was at log time %s.",
                               7326                 :                                timestamptz_to_str(xtime)) : 0));
 5163 heikki.linnakangas       7327 ECB             : 
 4770                          7328                 :     /*
 4686 itagaki.takahiro         7329                 :      * Finally, execute archive_cleanup_command, if any.
                               7330                 :      */
 1596 peter_e                  7331 GIC          28 :     if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
   62 michael                  7332 LBC           0 :         ExecuteRecoveryCommand(archiveCleanupCommand,
   62 michael                  7333 ECB             :                                "archive_cleanup_command",
                               7334                 :                                false,
                               7335                 :                                WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
                               7336                 : 
 5163 heikki.linnakangas       7337 GIC          28 :     return true;
                               7338                 : }
                               7339                 : 
                               7340                 : /*
                               7341                 :  * Report availability of WAL for the given target LSN
                               7342                 :  *      (typically a slot's restart_lsn)
 1097 alvherre                 7343 ECB             :  *
                               7344                 :  * Returns one of the following enum values:
                               7345                 :  *
 1019                          7346                 :  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
                               7347                 :  *   max_wal_size.
                               7348                 :  *
                               7349                 :  * * WALAVAIL_EXTENDED means it is still available by preserving extra
 1097                          7350                 :  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
                               7351                 :  *   than max_wal_size, this state is not returned.
                               7352                 :  *
                               7353                 :  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
                               7354                 :  *   remove reserved segments. The walsender using this slot may still return
                               7355                 :  *   to one of the states above.
                               7356                 :  *
                               7357                 :  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
                               7358                 :  *   a slot with this LSN cannot continue.  (Any associated walsender
                               7359                 :  *   processes should have been terminated already.)
                               7360                 :  *
                               7361                 :  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
                               7362                 :  */
                               7363                 : WALAvailability
 1097 alvherre                 7364 GIC         305 : GetWALAvailability(XLogRecPtr targetLSN)
                               7365                 : {
                               7366                 :     XLogRecPtr  currpos;        /* current write LSN */
 1097 alvherre                 7367 ECB             :     XLogSegNo   currSeg;        /* segid of currpos */
                               7368                 :     XLogSegNo   targetSeg;      /* segid of targetLSN */
                               7369                 :     XLogSegNo   oldestSeg;      /* actual oldest segid */
                               7370                 :     XLogSegNo   oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
                               7371                 :     XLogSegNo   oldestSlotSeg;  /* oldest segid kept by slot */
                               7372                 :     uint64      keepSegs;
                               7373                 : 
                               7374                 :     /*
 1019                          7375                 :      * The slot reserves no WAL: it was deactivated or has never been active.
                               7376                 :      */
 1097 alvherre                 7377 GIC         305 :     if (XLogRecPtrIsInvalid(targetLSN))
                               7378              11 :         return WALAVAIL_INVALID_LSN;
                               7379                 : 
                               7380                 :     /*
                               7381                 :      * Calculate the oldest segment currently reserved by all slots,
                               7382                 :      * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
                               7383                 :      * oldestSlotSeg to the current segment.
                               7384                 :      */
 1000                          7385             294 :     currpos = GetXLogWriteRecPtr();
                               7386             294 :     XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
 1097                          7387             294 :     KeepLogSeg(currpos, &oldestSlotSeg);
                               7388                 : 
                               7389                 :     /*
                               7390                 :      * Find the oldest extant segment file. We get 1 until a checkpoint removes
 1097 alvherre                 7391 ECB             :      * the first WAL segment file since startup, which can make the status
                               7392                 :      * wrong under certain abnormal conditions, but that is harmless.
                               7393                 :      */
 1097 alvherre                 7394 GIC         294 :     oldestSeg = XLogGetLastRemovedSegno() + 1;
 1097 alvherre                 7395 ECB             : 
 1019                          7396                 :     /* calculate oldest segment by max_wal_size */
 1097 alvherre                 7397 GIC         294 :     XLByteToSeg(currpos, currSeg, wal_segment_size);
 1019                          7398             294 :     keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
 1097 alvherre                 7399 ECB             : 
 1097 alvherre                 7400 CBC         294 :     if (currSeg > keepSegs)
 1097 alvherre                 7401 GIC           8 :         oldestSegMaxWalSize = currSeg - keepSegs;
 1097 alvherre                 7402 ECB             :     else
 1097 alvherre                 7403 CBC         286 :         oldestSegMaxWalSize = 1;
                               7404                 : 
 1000 alvherre                 7405 ECB             :     /* the segment we care about */
 1000 alvherre                 7406 GIC         294 :     XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
 1000 alvherre                 7407 ECB             : 
                               7408                 :     /*
                               7409                 :      * No point in returning reserved or extended status values if the
                               7410                 :      * targetSeg is known to be lost.
                               7411                 :      */
 1019 alvherre                 7412 GIC         294 :     if (targetSeg >= oldestSlotSeg)
 1097 alvherre                 7413 ECB             :     {
 1019                          7414                 :         /* show "reserved" when targetSeg is within max_wal_size */
 1019 alvherre                 7415 GIC         293 :         if (targetSeg >= oldestSegMaxWalSize)
 1097                          7416             291 :             return WALAVAIL_RESERVED;
                               7417                 : 
                               7418                 :         /* being retained by slots exceeding max_wal_size */
 1019                          7419               2 :         return WALAVAIL_EXTENDED;
 1097 alvherre                 7420 ECB             :     }
                               7421                 : 
                               7422                 :     /* WAL segments are no longer retained but haven't been removed yet */
 1019 alvherre                 7423 GIC           1 :     if (targetSeg >= oldestSeg)
                               7424               1 :         return WALAVAIL_UNRESERVED;
                               7425                 : 
 1097 alvherre                 7426 ECB             :     /* Definitely lost */
 1097 alvherre                 7427 LBC           0 :     return WALAVAIL_REMOVED;
 1097 alvherre                 7428 ECB             : }
                               7429                 : 
                               7430                 : 
                               7431                 : /*
                               7432                 :  * Retreat *logSegNo to the last segment that we need to retain because of
                               7433                 :  * either wal_keep_size or replication slots.
                               7434                 :  *
                               7435                 :  * This is calculated by subtracting wal_keep_size from the given xlog
                               7436                 :  * location, recptr, and by making sure that the result is below the
                               7437                 :  * requirement of replication slots.  For the latter criterion we do consider
 1097 alvherre                 7438 EUB             :  * the effects of max_slot_wal_keep_size: reserve at most that much space back
                               7439                 :  * from recptr.
                               7440                 :  *
  632 alvherre                 7441 ECB             :  * Note about replication slots: if this function calculates a value
                               7442                 :  * that's further ahead than what slots need reserved, then affected
                               7443                 :  * slots need to be invalidated and this function invoked again.
                               7444                 :  * XXX it might be a good idea to rewrite this function so that
                               7445                 :  * invalidation is optionally done here, instead.
                               7446                 :  */
                               7447                 : static void
 3941 heikki.linnakangas       7448 GIC        2660 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
                               7449                 : {
                               7450                 :     XLogSegNo   currSegNo;
                               7451                 :     XLogSegNo   segno;
                               7452                 :     XLogRecPtr  keep;
                               7453                 : 
 1097 alvherre                 7454            2660 :     XLByteToSeg(recptr, currSegNo, wal_segment_size);
 1097 alvherre                 7455 CBC        2660 :     segno = currSegNo;
 4282 simon                    7456 EUB             : 
                               7457                 :     /*
 1097 alvherre                 7458 ECB             :      * Calculate how many segments are kept by slots first, adjusting for
                               7459                 :      * max_slot_wal_keep_size.
                               7460                 :      */
 1097 alvherre                 7461 GIC        2660 :     keep = XLogGetReplicationSlotMinimumLSN();
                               7462            2660 :     if (keep != InvalidXLogRecPtr)
                               7463                 :     {
 1097 alvherre                 7464 CBC         384 :         XLByteToSeg(keep, segno, wal_segment_size);
                               7465                 : 
                               7466                 :         /* Cap by max_slot_wal_keep_size ... */
 1097 alvherre                 7467 GIC         384 :         if (max_slot_wal_keep_size_mb >= 0)
                               7468                 :         {
                               7469                 :             uint64      slot_keep_segs;
                               7470                 : 
                               7471              17 :             slot_keep_segs =
                               7472              17 :                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
 3355 rhaas                    7473 ECB             : 
 1097 alvherre                 7474 CBC          17 :             if (currSegNo - segno > slot_keep_segs)
 1097 alvherre                 7475 GIC           4 :                 segno = currSegNo - slot_keep_segs;
                               7476                 :         }
 1097 alvherre                 7477 ECB             :     }
                               7478                 : 
                               7479                 :     /* but, keep at least wal_keep_size if that's set */
  993 fujii                    7480 CBC        2660 :     if (wal_keep_size_mb > 0)
                               7481                 :     {
  993 fujii                    7482 ECB             :         uint64      keep_segs;
                               7483                 : 
  993 fujii                    7484 GIC          59 :         keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
                               7485              59 :         if (currSegNo - segno < keep_segs)
                               7486                 :         {
                               7487                 :             /* avoid underflow, don't go below 1 */
                               7488              59 :             if (currSegNo <= keep_segs)
                               7489              57 :                 segno = 1;
                               7490                 :             else
                               7491               2 :                 segno = currSegNo - keep_segs;
  993 fujii                    7492 ECB             :         }
 3355 rhaas                    7493 EUB             :     }
                               7494                 : 
                               7495                 :     /* don't delete WAL segments newer than the calculated segment */
 1000 alvherre                 7496 GIC        2660 :     if (segno < *logSegNo)
 3941 heikki.linnakangas       7497             109 :         *logSegNo = segno;
 4282 simon                    7498 CBC        2660 : }
                               7499                 : 
                               7500                 : /*
                               7501                 :  * Write a NEXTOID log record
                               7502                 :  */
                               7503                 : void
 8192 vadim4o                  7504 GIC        1248 : XLogPutNextOid(Oid nextOid)
                               7505                 : {
 3062 heikki.linnakangas       7506            1248 :     XLogBeginInsert();
                               7507            1248 :     XLogRegisterData((char *) (&nextOid), sizeof(Oid));
                               7508            1248 :     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
                               7509                 : 
                               7510                 :     /*
                               7511                 :      * We need not flush the NEXTOID record immediately, because any of the
                               7512                 :      * just-allocated OIDs could only reach disk as part of a tuple insert or
                               7513                 :      * update that would have its own XLOG record that must follow the NEXTOID
                               7514                 :      * record.  Therefore, the standard buffer LSN interlock applied to those
                               7515                 :      * records will ensure no such OID reaches disk before the NEXTOID record
                               7516                 :      * does.
                               7517                 :      *
                               7518                 :      * Note, however, that the above statement only covers state "within" the
                               7519                 :      * database.  When we use a generated OID as a file or directory name, we
                               7520                 :      * are in a sense violating the basic WAL rule, because that filesystem
                               7521                 :      * change may reach disk before the NEXTOID WAL record does.  The impact
                               7522                 :      * of this is that if a database crash occurs immediately afterward, we
                               7523                 :      * might after restart re-generate the same OID and find that it conflicts
                               7524                 :      * with the leftover file or directory.  But since for safety's sake we
 5624 bruce                    7525 ECB             :      * always loop until finding a nonconflicting filename, this poses no real
                               7526                 :      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
                               7527                 :      */
 6555 tgl                      7528 GIC        1248 : }
                               7529                 : 
                               7530                 : /*
                               7531                 :  * Write an XLOG SWITCH record.
                               7532                 :  *
                               7533                 :  * Here we just blindly issue an XLogInsert request for the record.
                               7534                 :  * All the magic happens inside XLogInsert.
                               7535                 :  *
                               7536                 :  * The return value is either the end+1 address of the switch record,
                               7537                 :  * or the end+1 address of the prior segment if we did not need to
 6090 tgl                      7538 ECB             :  * write a switch record because we are already at segment start.
                               7539                 :  */
                               7540                 : XLogRecPtr
 2299 andres                   7541 GIC         300 : RequestXLogSwitch(bool mark_unimportant)
                               7542                 : {
                               7543                 :     XLogRecPtr  RecPtr;
                               7544                 : 
                               7545                 :     /* XLOG SWITCH has no data */
 3062 heikki.linnakangas       7546 CBC         300 :     XLogBeginInsert();
 2299 andres                   7547 ECB             : 
 2299 andres                   7548 CBC         300 :     if (mark_unimportant)
 2299 andres                   7549 UIC           0 :         XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
 3062 heikki.linnakangas       7550 GIC         300 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
                               7551                 : 
 6090 tgl                      7552             300 :     return RecPtr;
                               7553                 : }
                               7554                 : 
 4443 simon                    7555 ECB             : /*
                               7556                 :  * Write a RESTORE POINT record
                               7557                 :  */
                               7558                 : XLogRecPtr
 4443 simon                    7559 CBC           3 : XLogRestorePoint(const char *rpName)
                               7560                 : {
 4382 bruce                    7561 ECB             :     XLogRecPtr  RecPtr;
                               7562                 :     xl_restore_point xlrec;
                               7563                 : 
 4443 simon                    7564 CBC           3 :     xlrec.rp_time = GetCurrentTimestamp();
 3338 tgl                      7565 GIC           3 :     strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
                               7566                 : 
 3062 heikki.linnakangas       7567 CBC           3 :     XLogBeginInsert();
 3062 heikki.linnakangas       7568 GIC           3 :     XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
                               7569                 : 
                               7570               3 :     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
                               7571                 : 
 4427 rhaas                    7572               3 :     ereport(LOG,
 4427 rhaas                    7573 ECB             :             (errmsg("restore point \"%s\" created at %X/%X",
                               7574                 :                     rpName, LSN_FORMAT_ARGS(RecPtr))));
                               7575                 : 
 4443 simon                    7576 CBC           3 :     return RecPtr;
 4443 simon                    7577 ECB             : }
                               7578                 : 
                               7579                 : /*
 4729 heikki.linnakangas       7580                 :  * Check if any of the GUC parameters that are critical for hot standby
                               7581                 :  * have changed, and update the value in pg_control file if necessary.
                               7582                 :  */
                               7583                 : static void
 4729 heikki.linnakangas       7584 CBC        1142 : XLogReportParameters(void)
 4827 heikki.linnakangas       7585 ECB             : {
 4729 heikki.linnakangas       7586 GIC        1142 :     if (wal_level != ControlFile->wal_level ||
 3385 rhaas                    7587             982 :         wal_log_hints != ControlFile->wal_log_hints ||
 4729 heikki.linnakangas       7588 GBC         923 :         MaxConnections != ControlFile->MaxConnections ||
 3566 rhaas                    7589 GIC         922 :         max_worker_processes != ControlFile->max_worker_processes ||
 1517 michael                  7590             922 :         max_wal_senders != ControlFile->max_wal_senders ||
 4729 heikki.linnakangas       7591             918 :         max_prepared_xacts != ControlFile->max_prepared_xacts ||
 3049 alvherre                 7592             838 :         max_locks_per_xact != ControlFile->max_locks_per_xact ||
                               7593             838 :         track_commit_timestamp != ControlFile->track_commit_timestamp)
                               7594                 :     {
                               7595                 :         /*
                               7596                 :          * The change in number of backend slots doesn't need to be WAL-logged
                               7597                 :          * if archiving is not enabled, as you can't start archive recovery
                               7598                 :          * with wal_level=minimal anyway. We don't really care about the
                               7599                 :          * values in pg_control either if wal_level=minimal, but seems better
                               7600                 :          * to keep them up-to-date to avoid confusion.
                               7601                 :          */
 4729 heikki.linnakangas       7602             311 :         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
                               7603                 :         {
                               7604                 :             xl_parameter_change xlrec;
                               7605                 :             XLogRecPtr  recptr;
                               7606                 : 
                               7607             309 :             xlrec.MaxConnections = MaxConnections;
 3566 rhaas                    7608             309 :             xlrec.max_worker_processes = max_worker_processes;
 1517 michael                  7609 CBC         309 :             xlrec.max_wal_senders = max_wal_senders;
 4729 heikki.linnakangas       7610 GIC         309 :             xlrec.max_prepared_xacts = max_prepared_xacts;
                               7611             309 :             xlrec.max_locks_per_xact = max_locks_per_xact;
                               7612             309 :             xlrec.wal_level = wal_level;
 3385 rhaas                    7613             309 :             xlrec.wal_log_hints = wal_log_hints;
 3049 alvherre                 7614             309 :             xlrec.track_commit_timestamp = track_commit_timestamp;
 4729 heikki.linnakangas       7615 ECB             : 
 3062 heikki.linnakangas       7616 CBC         309 :             XLogBeginInsert();
 3062 heikki.linnakangas       7617 GIC         309 :             XLogRegisterData((char *) &xlrec, sizeof(xlrec));
                               7618                 : 
                               7619             309 :             recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
 3301 fujii                    7620             309 :             XLogFlush(recptr);
                               7621                 :         }
 4827 heikki.linnakangas       7622 ECB             : 
 1035 tmunro                   7623 CBC         311 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                               7624                 : 
 4729 heikki.linnakangas       7625             311 :         ControlFile->MaxConnections = MaxConnections;
 3566 rhaas                    7626 GIC         311 :         ControlFile->max_worker_processes = max_worker_processes;
 1517 michael                  7627             311 :         ControlFile->max_wal_senders = max_wal_senders;
 4729 heikki.linnakangas       7628 CBC         311 :         ControlFile->max_prepared_xacts = max_prepared_xacts;
 4729 heikki.linnakangas       7629 GIC         311 :         ControlFile->max_locks_per_xact = max_locks_per_xact;
                               7630             311 :         ControlFile->wal_level = wal_level;
 3385 rhaas                    7631             311 :         ControlFile->wal_log_hints = wal_log_hints;
 3049 alvherre                 7632 CBC         311 :         ControlFile->track_commit_timestamp = track_commit_timestamp;
 4729 heikki.linnakangas       7633             311 :         UpdateControlFile();
                               7634                 : 
 1035 tmunro                   7635             311 :         LWLockRelease(ControlFileLock);
 4729 heikki.linnakangas       7636 ECB             :     }
 4827 heikki.linnakangas       7637 GIC        1142 : }
                               7638                 : 
                               7639                 : /*
                               7640                 :  * Update full_page_writes in shared memory, and write an
 4092 simon                    7641 ECB             :  * XLOG_FPW_CHANGE record if necessary.
                               7642                 :  *
                               7643                 :  * Note: this function assumes there is no other process running
                               7644                 :  * concurrently that could update it.
                               7645                 :  */
                               7646                 : void
 4092 simon                    7647 GIC        1533 : UpdateFullPageWrites(void)
                               7648                 : {
 4092 simon                    7649 CBC        1533 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
 1654 akapila                  7650 ECB             :     bool        recoveryInProgress;
                               7651                 : 
 4092 simon                    7652                 :     /*
                               7653                 :      * Do nothing if full_page_writes has not been changed.
                               7654                 :      *
                               7655                 :      * It's safe to check the shared full_page_writes without the lock,
                               7656                 :      * because we assume that there is no concurrently running process which
 3955 bruce                    7657                 :      * can update it.
 4092 simon                    7658                 :      */
 4092 simon                    7659 CBC        1533 :     if (fullPageWrites == Insert->fullPageWrites)
 4092 simon                    7660 GIC        1264 :         return;
                               7661                 : 
                               7662                 :     /*
                               7663                 :      * Perform this outside critical section so that the WAL insert
                               7664                 :      * initialization done by RecoveryInProgress() doesn't trigger an
 1654 akapila                  7665 ECB             :      * assertion failure.
                               7666                 :      */
 1654 akapila                  7667 CBC         269 :     recoveryInProgress = RecoveryInProgress();
 1654 akapila                  7668 ECB             : 
 4051 heikki.linnakangas       7669 CBC         269 :     START_CRIT_SECTION();
                               7670                 : 
                               7671                 :     /*
                               7672                 :      * It's always safe to take full page images, even when not strictly
                               7673                 :      * required, but not the other way round. So if we're setting full_page_writes
                               7674                 :      * to true, first set it true and then write the WAL record. If we're
                               7675                 :      * setting it to false, first write the WAL record and then set the global
                               7676                 :      * flag.
                               7677                 :      */
 4051 heikki.linnakangas       7678 GIC         269 :     if (fullPageWrites)
                               7679                 :     {
 3306                          7680             267 :         WALInsertLockAcquireExclusive();
 4051                          7681             267 :         Insert->fullPageWrites = true;
 3306                          7682             267 :         WALInsertLockRelease();
                               7683                 :     }
                               7684                 : 
                               7685                 :     /*
                               7686                 :      * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
                               7687                 :      * full_page_writes during archive recovery, if required.
                               7688                 :      */
 1654 akapila                  7689 CBC         269 :     if (XLogStandbyInfoActive() && !recoveryInProgress)
                               7690                 :     {
 3062 heikki.linnakangas       7691 UIC           0 :         XLogBeginInsert();
                               7692               0 :         XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
                               7693                 : 
                               7694               0 :         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
                               7695                 :     }
                               7696                 : 
 4051 heikki.linnakangas       7697 GIC         269 :     if (!fullPageWrites)
                               7698                 :     {
 3306                          7699               2 :         WALInsertLockAcquireExclusive();
 4051                          7700               2 :         Insert->fullPageWrites = false;
 3306                          7701               2 :         WALInsertLockRelease();
 4092 simon                    7702 ECB             :     }
 4051 heikki.linnakangas       7703 GIC         269 :     END_CRIT_SECTION();
                               7704                 : }
                               7705                 : 
                               7706                 : /*
 8062 tgl                      7707 ECB             :  * XLOG resource manager's routines
                               7708                 :  *
 5163 heikki.linnakangas       7709                 :  * Definitions of info values are in include/catalog/pg_control.h, though
 5035 tgl                      7710 EUB             :  * not all record types are related to control file updates.
  417 heikki.linnakangas       7711 ECB             :  *
                               7712                 :  * NOTE: Some XLOG record types that are directly related to WAL recovery
                               7713                 :  * are handled in xlogrecovery_redo().
                               7714                 :  */
                               7715                 : void
 3062 heikki.linnakangas       7716 GIC       28161 : xlog_redo(XLogReaderState *record)
                               7717                 : {
                               7718           28161 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
                               7719           28161 :     XLogRecPtr  lsn = record->EndRecPtr;
 8192 vadim4o                  7720 ECB             : 
                               7721                 :     /*
                               7722                 :      * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
                               7723                 :      * XLOG_FPI_FOR_HINT records.
                               7724                 :      */
 3058 heikki.linnakangas       7725 CBC       28161 :     Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
 3058 heikki.linnakangas       7726 ECB             :            !XLogRecHasAnyBlockRefs(record));
                               7727                 : 
 8057 tgl                      7728 CBC       28161 :     if (info == XLOG_NEXTOID)
 8192 vadim4o                  7729 ECB             :     {
                               7730                 :         Oid         nextOid;
                               7731                 : 
                               7732                 :         /*
 4080 tgl                      7733                 :          * We used to try to take the maximum of ShmemVariableCache->nextOid
                               7734                 :          * and the recorded nextOid, but that fails if the OID counter wraps
                               7735                 :          * around.  Since no OID allocation should be happening during replay
                               7736                 :          * anyway, better to just believe the record exactly.  We still take
                               7737                 :          * OidGenLock while setting the variable, just in case.
                               7738                 :          */
 8192 vadim4o                  7739 GIC          72 :         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
 4080 tgl                      7740              72 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
                               7741              72 :         ShmemVariableCache->nextOid = nextOid;
                               7742              72 :         ShmemVariableCache->oidCount = 0;
                               7743              72 :         LWLockRelease(OidGenLock);
                               7744                 :     }
 8062 tgl                      7745 CBC       28089 :     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
                               7746                 :     {
 8062 tgl                      7747 ECB             :         CheckPoint  checkPoint;
  417 heikki.linnakangas       7748                 :         TimeLineID  replayTLI;
 8062 tgl                      7749                 : 
 8062 tgl                      7750 CBC          25 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
 8062 tgl                      7751 ECB             :         /* In a SHUTDOWN checkpoint, believe the counters exactly */
 4080 tgl                      7752 CBC          25 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
  971 andres                   7753              25 :         ShmemVariableCache->nextXid = checkPoint.nextXid;
 4080 tgl                      7754              25 :         LWLockRelease(XidGenLock);
 4080 tgl                      7755 GIC          25 :         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
 8062                          7756              25 :         ShmemVariableCache->nextOid = checkPoint.nextOid;
                               7757              25 :         ShmemVariableCache->oidCount = 0;
 4080                          7758              25 :         LWLockRelease(OidGenLock);
 6514                          7759              25 :         MultiXactSetNextMXact(checkPoint.nextMulti,
                               7760                 :                               checkPoint.nextMultiOffset);
                               7761                 : 
 2752 andres                   7762              25 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
 2752 andres                   7763 ECB             :                                checkPoint.oldestMultiDB);
                               7764                 : 
                               7765                 :         /*
                               7766                 :          * No need to set oldestClogXid here as well; it'll be set when we
                               7767                 :          * redo an xl_clog_truncate if it changed since initialization.
 2208 rhaas                    7768                 :          */
 4799 tgl                      7769 CBC          25 :         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
 6797 bruce                    7770 ECB             : 
 4730 heikki.linnakangas       7771                 :         /*
 4660 bruce                    7772                 :          * If we see a shutdown checkpoint while waiting for an end-of-backup
 4302 peter_e                  7773                 :          * record, the backup was canceled and the end-of-backup record will
 4660 bruce                    7774                 :          * never arrive.
 4730 heikki.linnakangas       7775                 :          */
 3698 heikki.linnakangas       7776 GIC          25 :         if (ArchiveRecoveryRequested &&
 4092 simon                    7777 CBC          25 :             !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
 4092 simon                    7778 LBC           0 :             XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
 4080 tgl                      7779 UIC           0 :             ereport(PANIC,
 2118 tgl                      7780 ECB             :                     (errmsg("online backup was canceled, recovery cannot continue")));
 4730 heikki.linnakangas       7781                 : 
                               7782                 :         /*
                               7783                 :          * If we see a shutdown checkpoint, we know that nothing was running
 1029 andres                   7784                 :          * on the primary at this point. So fake-up an empty running-xacts
                               7785                 :          * record and use that here and now. Recover additional standby state
 4660 bruce                    7786                 :          * for prepared transactions.
 4744 heikki.linnakangas       7787                 :          */
 4859 simon                    7788 CBC          25 :         if (standbyState >= STANDBY_INITIALIZED)
 4859 simon                    7789 ECB             :         {
 4744 heikki.linnakangas       7790                 :             TransactionId *xids;
                               7791                 :             int         nxids;
                               7792                 :             TransactionId oldestActiveXID;
 4714 simon                    7793                 :             TransactionId latestCompletedXid;
 4744 heikki.linnakangas       7794                 :             RunningTransactionsData running;
                               7795                 : 
 4744 heikki.linnakangas       7796 CBC          23 :             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
                               7797                 : 
 4859 simon                    7798 ECB             :             /*
                               7799                 :              * Construct a RunningTransactions snapshot representing a shut
                               7800                 :              * down server, with only prepared transactions still alive. We're
                               7801                 :              * never overflowed at this point because all subxids are listed
                               7802                 :              * with their parent prepared transactions.
                               7803                 :              */
 4744 heikki.linnakangas       7804 GIC          23 :             running.xcnt = nxids;
 3780 simon                    7805              23 :             running.subxcnt = 0;
 4744 heikki.linnakangas       7806              23 :             running.subxid_overflow = false;
  971 andres                   7807              23 :             running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
 4744 heikki.linnakangas       7808 CBC          23 :             running.oldestRunningXid = oldestActiveXID;
  971 andres                   7809 GIC          23 :             latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
 4714 simon                    7810 CBC          23 :             TransactionIdRetreat(latestCompletedXid);
 4713 simon                    7811 GIC          23 :             Assert(TransactionIdIsNormal(latestCompletedXid));
 4714                          7812              23 :             running.latestCompletedXid = latestCompletedXid;
 4744 heikki.linnakangas       7813              23 :             running.xids = xids;
                               7814                 : 
                               7815              23 :             ProcArrayApplyRecoveryInfo(&running);
                               7816                 : 
 2173 simon                    7817              23 :             StandbyRecoverPreparedTransactions();
                               7818                 :         }
                               7819                 : 
 6075 tgl                      7820 ECB             :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
 1035 tmunro                   7821 CBC          25 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  971 andres                   7822 GIC          25 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 1035 tmunro                   7823              25 :         LWLockRelease(ControlFileLock);
                               7824                 : 
                               7825                 :         /* Update shared-memory copy of checkpoint XID/epoch */
 3121 andres                   7826              25 :         SpinLockAcquire(&XLogCtl->info_lck);
  971                          7827              25 :         XLogCtl->ckptFullXid = checkPoint.nextXid;
 3121 andres                   7828 CBC          25 :         SpinLockRelease(&XLogCtl->info_lck);
                               7829                 : 
 6836 tgl                      7830 ECB             :         /*
                               7831                 :          * We should've already switched to the new TLI before replaying this
                               7832                 :          * record.
                               7833                 :          */
  417 heikki.linnakangas       7834 GIC          25 :         (void) GetCurrentReplayRecPtr(&replayTLI);
  520 rhaas                    7835              25 :         if (checkPoint.ThisTimeLineID != replayTLI)
 3775 heikki.linnakangas       7836 UIC           0 :             ereport(PANIC,
                               7837                 :                     (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
                               7838                 :                             checkPoint.ThisTimeLineID, replayTLI)));
 6089 tgl                      7839 ECB             : 
  501 rhaas                    7840 GIC          25 :         RecoveryRestartPoint(&checkPoint, record);
 8062 tgl                      7841 ECB             :     }
 8062 tgl                      7842 CBC       28064 :     else if (info == XLOG_CHECKPOINT_ONLINE)
 8062 tgl                      7843 ECB             :     {
                               7844                 :         CheckPoint  checkPoint;
                               7845                 :         TimeLineID  replayTLI;
                               7846                 : 
 8062 tgl                      7847 GIC         141 :         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
                               7848                 :         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
 4080                          7849             141 :         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
  971 andres                   7850 CBC         141 :         if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
                               7851                 :                                       checkPoint.nextXid))
  971 andres                   7852 UBC           0 :             ShmemVariableCache->nextXid = checkPoint.nextXid;
 4080 tgl                      7853 GBC         141 :         LWLockRelease(XidGenLock);
                               7854                 : 
 1824 tgl                      7855 EUB             :         /*
                               7856                 :          * We ignore the nextOid counter in an ONLINE checkpoint, preferring
                               7857                 :          * to track OID assignment through XLOG_NEXTOID records.  The nextOid
 1824 tgl                      7858 ECB             :          * counter is from the start of the checkpoint and might well be stale
                               7859                 :          * compared to later XLOG_NEXTOID records.  We could try to take the
                               7860                 :          * maximum of the nextOid counter and our latest value, but since
                               7861                 :          * there's no particular guarantee about the speed with which the OID
                               7862                 :          * counter wraps around, that's a risky thing to do.  In any case,
                               7863                 :          * users of the nextOid counter are required to avoid assignment of
                               7864                 :          * duplicates, so that a somewhat out-of-date value should be safe.
                               7865                 :          */
                               7866                 : 
                               7867                 :         /* Handle multixact */
 6514 tgl                      7868 GIC         141 :         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
                               7869                 :                                   checkPoint.nextMultiOffset);
                               7870                 : 
                               7871                 :         /*
                               7872                 :          * NB: This may perform multixact truncation when replaying WAL
                               7873                 :          * generated by an older primary.
                               7874                 :          */
 2752 andres                   7875             141 :         MultiXactAdvanceOldest(checkPoint.oldestMulti,
                               7876                 :                                checkPoint.oldestMultiDB);
 4969 tgl                      7877 CBC         141 :         if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
                               7878                 :                                   checkPoint.oldestXid))
 4799 tgl                      7879 LBC           0 :             SetTransactionIdLimit(checkPoint.oldestXid,
 4799 tgl                      7880 ECB             :                                   checkPoint.oldestXidDB);
                               7881                 :         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
 1035 tmunro                   7882 GIC         141 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  971 andres                   7883             141 :         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 1035 tmunro                   7884             141 :         LWLockRelease(ControlFileLock);
                               7885                 : 
 4028 simon                    7886 ECB             :         /* Update shared-memory copy of checkpoint XID/epoch */
 3121 andres                   7887 GIC         141 :         SpinLockAcquire(&XLogCtl->info_lck);
  971                          7888             141 :         XLogCtl->ckptFullXid = checkPoint.nextXid;
 3121 andres                   7889 CBC         141 :         SpinLockRelease(&XLogCtl->info_lck);
                               7890                 : 
                               7891                 :         /* TLI should not change in an on-line checkpoint */
  417 heikki.linnakangas       7892 GIC         141 :         (void) GetCurrentReplayRecPtr(&replayTLI);
  520 rhaas                    7893             141 :         if (checkPoint.ThisTimeLineID != replayTLI)
 6997 tgl                      7894 UIC           0 :             ereport(PANIC,
                               7895                 :                     (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
                               7896                 :                             checkPoint.ThisTimeLineID, replayTLI)));
                               7897                 : 
  501 rhaas                    7898 GIC         141 :         RecoveryRestartPoint(&checkPoint, record);
                               7899                 :     }
  557 alvherre                 7900 CBC       27923 :     else if (info == XLOG_OVERWRITE_CONTRECORD)
  557 alvherre                 7901 ECB             :     {
  417 heikki.linnakangas       7902                 :         /* nothing to do here, handled in xlogrecovery_redo() */
  557 alvherre                 7903                 :     }
 3722 simon                    7904 CBC       27922 :     else if (info == XLOG_END_OF_RECOVERY)
                               7905                 :     {
 3722 simon                    7906 ECB             :         xl_end_of_recovery xlrec;
                               7907                 :         TimeLineID  replayTLI;
                               7908                 : 
 3722 simon                    7909 GIC           8 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
                               7910                 : 
 3722 simon                    7911 ECB             :         /*
                               7912                 :          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
                               7913                 :          * but this case is rarer and harder to test, so the benefit doesn't
                               7914                 :          * outweigh the potential extra cost of maintenance.
                               7915                 :          */
                               7916                 : 
                               7917                 :         /*
                               7918                 :          * We should've already switched to the new TLI before replaying this
                               7919                 :          * record.
                               7920                 :          */
  417 heikki.linnakangas       7921 GIC           8 :         (void) GetCurrentReplayRecPtr(&replayTLI);
  520 rhaas                    7922               8 :         if (xlrec.ThisTimeLineID != replayTLI)
 3722 simon                    7923 LBC           0 :             ereport(PANIC,
                               7924                 :                     (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
                               7925                 :                             xlrec.ThisTimeLineID, replayTLI)));
                               7926                 :     }
 5803 tgl                      7927 GIC       27914 :     else if (info == XLOG_NOOP)
                               7928                 :     {
                               7929                 :         /* nothing to do here */
 5803 tgl                      7930 ECB             :     }
 6090 tgl                      7931 GIC       27914 :     else if (info == XLOG_SWITCH)
                               7932                 :     {
                               7933                 :         /* nothing to do here */
                               7934                 :     }
 4443 simon                    7935           27820 :     else if (info == XLOG_RESTORE_POINT)
                               7936                 :     {
  417 heikki.linnakangas       7937 ECB             :         /* nothing to do here, handled in xlogrecovery.c */
 4443 simon                    7938                 :     }
 3058 heikki.linnakangas       7939 GBC       27815 :     else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
 3670 simon                    7940 EUB             :     {
                               7941                 :         /*
                               7942                 :          * XLOG_FPI records contain nothing else but one or more block
                               7943                 :          * references. Every block reference must include a full-page image
                               7944                 :          * even if full_page_writes was disabled when the record was generated
                               7945                 :          * - otherwise there would be no point in this record.
                               7946                 :          *
                               7947                 :          * XLOG_FPI_FOR_HINT records are generated when a page needs to be
                               7948                 :          * WAL-logged because of a hint bit update. They are only generated
  627 fujii                    7949 ECB             :          * when checksums and/or wal_log_hints are enabled. They may include
                               7950                 :          * no full-page images if full_page_writes was disabled when they were
                               7951                 :          * generated. In this case there is nothing to do here.
                               7952                 :          *
                               7953                 :          * No recovery conflicts are generated by these generic records - if a
                               7954                 :          * resource manager needs to generate conflicts, it has to define a
                               7955                 :          * separate WAL record type and redo routine.
                               7956                 :          */
  387 tmunro                   7957 CBC       57791 :         for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
                               7958                 :         {
                               7959                 :             Buffer      buffer;
                               7960                 : 
  627 fujii                    7961 GIC       30056 :             if (!XLogRecHasBlockImage(record, block_id))
                               7962                 :             {
                               7963              81 :                 if (info == XLOG_FPI)
  627 fujii                    7964 UIC           0 :                     elog(ERROR, "XLOG_FPI record did not contain a full-page image");
  627 fujii                    7965 CBC          81 :                 continue;
  627 fujii                    7966 ECB             :             }
                               7967                 : 
 1467 heikki.linnakangas       7968 CBC       29975 :             if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
 1467 heikki.linnakangas       7969 LBC           0 :                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
 1467 heikki.linnakangas       7970 CBC       29975 :             UnlockReleaseBuffer(buffer);
 1467 heikki.linnakangas       7971 ECB             :         }
 3670 simon                    7972                 :     }
 4843 heikki.linnakangas       7973 CBC          80 :     else if (info == XLOG_BACKUP_END)
 4843 heikki.linnakangas       7974 ECB             :     {
                               7975                 :         /* nothing to do here, handled in xlogrecovery_redo() */
                               7976                 :     }
 4729 heikki.linnakangas       7977 GIC          19 :     else if (info == XLOG_PARAMETER_CHANGE)
 4827 heikki.linnakangas       7978 ECB             :     {
                               7979                 :         xl_parameter_change xlrec;
                               7980                 : 
                               7981                 :         /* Update our copy of the parameters in pg_control */
 4729 heikki.linnakangas       7982 CBC          19 :         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
 4729 heikki.linnakangas       7983 ECB             : 
                               7984                 :         /*
                               7985                 :          * Invalidate logical slots if we are in hot standby and the primary
                               7986                 :          * does not have a WAL level sufficient for logical decoding. No need
                               7987                 :          * to search for potentially conflicting logical slots if standby is
                               7988                 :          * running with wal_level lower than logical, because in that case, we
                               7989                 :          * would have either disallowed creation of logical slots or
                               7990                 :          * invalidated existing ones.
                               7991                 :          */
    2 andres                   7992 GNC          19 :         if (InRecovery && InHotStandby &&
                               7993               4 :             xlrec.wal_level < WAL_LEVEL_LOGICAL &&
                               7994               3 :             wal_level >= WAL_LEVEL_LOGICAL)
                               7995               1 :             InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
                               7996                 :                                                0, InvalidOid,
                               7997                 :                                                InvalidTransactionId);
                               7998                 : 
 4724 heikki.linnakangas       7999 CBC          19 :         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 4729 heikki.linnakangas       8000 GIC          19 :         ControlFile->MaxConnections = xlrec.MaxConnections;
 3566 rhaas                    8001              19 :         ControlFile->max_worker_processes = xlrec.max_worker_processes;
 1517 michael                  8002 CBC          19 :         ControlFile->max_wal_senders = xlrec.max_wal_senders;
 4729 heikki.linnakangas       8003              19 :         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
                               8004              19 :         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
 4729 heikki.linnakangas       8005 GIC          19 :         ControlFile->wal_level = xlrec.wal_level;
 3006                          8006              19 :         ControlFile->wal_log_hints = xlrec.wal_log_hints;
                               8007                 : 
                               8008                 :         /*
                               8009                 :          * Update minRecoveryPoint to ensure that if recovery is aborted, we
 4660 bruce                    8010 ECB             :          * recover back up to this point before allowing hot standby again.
 2596 peter_e                  8011                 :          * This is important if the max_* settings are decreased, to ensure
 1739 michael                  8012 EUB             :          * you don't run queries against the WAL preceding the change. The
                               8013                 :          * local copies cannot be updated as long as crash recovery is
                               8014                 :          * happening and we expect all the WAL to be replayed.
                               8015                 :          */
 1739 michael                  8016 CBC          19 :         if (InArchiveRecovery)
                               8017                 :         {
  417 heikki.linnakangas       8018               5 :             LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
  417 heikki.linnakangas       8019 GIC           5 :             LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
                               8020                 :         }
                               8021              19 :         if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
                               8022                 :         {
  417 heikki.linnakangas       8023 ECB             :             TimeLineID  replayTLI;
                               8024                 : 
  417 heikki.linnakangas       8025 CBC           5 :             (void) GetCurrentReplayRecPtr(&replayTLI);
 4724                          8026               5 :             ControlFile->minRecoveryPoint = lsn;
  520 rhaas                    8027 GIC           5 :             ControlFile->minRecoveryPointTLI = replayTLI;
 4724 heikki.linnakangas       8028 EUB             :         }
 4724 heikki.linnakangas       8029 ECB             : 
 2747 alvherre                 8030 GIC          19 :         CommitTsParameterChange(xlrec.track_commit_timestamp,
                               8031              19 :                                 ControlFile->track_commit_timestamp);
                               8032              19 :         ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
                               8033                 : 
 4729 heikki.linnakangas       8034              19 :         UpdateControlFile();
 4724                          8035              19 :         LWLockRelease(ControlFileLock);
                               8036                 : 
                               8037                 :         /* Check to see if any parameter change gives a problem on recovery */
 4729                          8038              19 :         CheckRequiredParameterValues();
                               8039                 :     }
 4092 simon                    8040 UIC           0 :     else if (info == XLOG_FPW_CHANGE)
                               8041                 :     {
                               8042                 :         bool        fpw;
                               8043                 : 
 4092 simon                    8044 LBC           0 :         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
                               8045                 : 
                               8046                 :         /*
                               8047                 :          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
                               8048                 :          * do_pg_backup_start() and do_pg_backup_stop() can check whether
                               8049                 :          * full_page_writes has been disabled during online backup.
                               8050                 :          */
                               8051               0 :         if (!fpw)
                               8052                 :         {
 3121 andres                   8053               0 :             SpinLockAcquire(&XLogCtl->info_lck);
  501 rhaas                    8054 UIC           0 :             if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
  501 rhaas                    8055 UBC           0 :                 XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
 3121 andres                   8056 UIC           0 :             SpinLockRelease(&XLogCtl->info_lck);
                               8057                 :         }
 4092 simon                    8058 ECB             : 
                               8059                 :         /* Keep track of full_page_writes */
 4092 simon                    8060 LBC           0 :         lastFullPageWrites = fpw;
                               8061                 :     }
 8205 vadim4o                  8062 GIC       28159 : }
 8053 bruce                    8063 ECB             : 
 8059 tgl                      8064                 : /*
                               8065                 :  * Return the extra open flags used for opening a file, depending on the
                               8066                 :  * value of the GUCs wal_sync_method, fsync and io_direct.
                               8067                 :  */
 5443 magnus                   8068                 : static int
 5443 magnus                   8069 CBC        8348 : get_sync_bit(int method)
 8059 tgl                      8070 EUB             : {
 4790 bruce                    8071 GIC        8348 :     int         o_direct_flag = 0;
                               8072                 : 
                               8073                 :     /*
                               8074                 :      * Use O_DIRECT if requested, except in walreceiver process.  The WAL
                               8075                 :      * written by walreceiver is normally read by the startup process soon
                               8076                 :      * after it's written.  Also, walreceiver performs unaligned writes, which
                               8077                 :      * don't work with O_DIRECT, so it is required for correctness too.
                               8078                 :      */
    1 tmunro                   8079 GNC        8348 :     if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
 4797 heikki.linnakangas       8080 GIC           7 :         o_direct_flag = PG_O_DIRECT;
                               8081                 : 
                               8082                 :     /* If fsync is disabled, never open in sync mode */
    1 tmunro                   8083 GNC        8348 :     if (!enableFsync)
                               8084            8348 :         return o_direct_flag;
                               8085                 : 
 5443 magnus                   8086 UIC           0 :     switch (method)
                               8087                 :     {
                               8088                 :             /*
 5050 bruce                    8089 ECB             :              * enum values for all sync options are defined even if they are
                               8090                 :              * not supported on the current platform.  But if not, they are
 5050 bruce                    8091 EUB             :              * not included in the enum option array, and therefore will never
                               8092                 :              * be seen here.
                               8093                 :              */
 5445 magnus                   8094 UIC           0 :         case SYNC_METHOD_FSYNC:
 5445 magnus                   8095 ECB             :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
                               8096                 :         case SYNC_METHOD_FDATASYNC:
    1 tmunro                   8097 UNC           0 :             return o_direct_flag;
                               8098                 : #ifdef O_SYNC
 5445 magnus                   8099 LBC           0 :         case SYNC_METHOD_OPEN:
  261 tmunro                   8100 UNC           0 :             return O_SYNC | o_direct_flag;
                               8101                 : #endif
                               8102                 : #ifdef O_DSYNC
 5445 magnus                   8103 LBC           0 :         case SYNC_METHOD_OPEN_DSYNC:
  261 tmunro                   8104 UNC           0 :             return O_DSYNC | o_direct_flag;
                               8105                 : #endif
 5445 magnus                   8106 UIC           0 :         default:
 5441 tgl                      8107 ECB             :             /* can't happen (unless we are out of sync with option array) */
 5441 tgl                      8108 UIC           0 :             elog(ERROR, "unrecognized wal_sync_method: %d", method);
                               8109                 :             return 0;           /* silence warning */
                               8110                 :     }
                               8111                 : }
                               8112                 : 
                               8113                 : /*
                               8114                 :  * GUC support
                               8115                 :  */
                               8116                 : void
 4385 tgl                      8117 GIC        1857 : assign_xlog_sync_method(int new_sync_method, void *extra)
                               8118                 : {
 5443 magnus                   8119            1857 :     if (sync_method != new_sync_method)
                               8120                 :     {
                               8121                 :         /*
                               8122                 :          * To ensure that no blocks escape unsynced, force an fsync on the
                               8123                 :          * currently open log segment (if any).  Also, if the open flag is
                               8124                 :          * changing, close the log file so it will be reopened (with new flag
 6385 bruce                    8125 ECB             :          * bit) at next use.
                               8126                 :          */
 8059 tgl                      8127 UIC           0 :         if (openLogFile >= 0)
                               8128                 :         {
 2213 rhaas                    8129 LBC           0 :             pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
 8059 tgl                      8130 UIC           0 :             if (pg_fsync(openLogFile) != 0)
 1223 michael                  8131 ECB             :             {
 1223 michael                  8132 EUB             :                 char        xlogfname[MAXFNAMELEN];
 1223 michael                  8133 ECB             :                 int         save_errno;
                               8134                 : 
 1223 michael                  8135 UIC           0 :                 save_errno = errno;
  520 rhaas                    8136 LBC           0 :                 XLogFileName(xlogfname, openLogTLI, openLogSegNo,
 1223 michael                  8137 EUB             :                              wal_segment_size);
 1223 michael                  8138 LBC           0 :                 errno = save_errno;
 7202 tgl                      8139 UIC           0 :                 ereport(PANIC,
                               8140                 :                         (errcode_for_file_access(),
 1223 michael                  8141 ECB             :                          errmsg("could not fsync file \"%s\": %m", xlogfname)));
                               8142                 :             }
                               8143                 : 
 2213 rhaas                    8144 UIC           0 :             pgstat_report_wait_end();
 5441 tgl                      8145 LBC           0 :             if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
 6142 bruce                    8146 UIC           0 :                 XLogFileClose();
                               8147                 :         }
                               8148                 :     }
 8059 tgl                      8149 GIC        1857 : }
 8059 tgl                      8150 ECB             : 
                               8151                 : 
                               8152                 : /*
                               8153                 :  * Issue appropriate kind of fsync (if any) for an XLOG output file.
                               8154                 :  *
                               8155                 :  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
                               8156                 :  * 'segno' and 'tli' identify the file for error reporting purposes.
                               8157                 :  */
                               8158                 : void
  520 rhaas                    8159 GIC      315883 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
 8059 tgl                      8160 ECB             : {
 1223 michael                  8161 CBC      315883 :     char       *msg = NULL;
  761 fujii                    8162 ECB             :     instr_time  start;
                               8163                 : 
  520 rhaas                    8164 GIC      315883 :     Assert(tli != 0);
                               8165                 : 
                               8166                 :     /*
  761 fujii                    8167 ECB             :      * Quick exit if fsync is disabled or write() has already synced the WAL
                               8168                 :      * file.
                               8169                 :      */
  761 fujii                    8170 CBC      315883 :     if (!enableFsync ||
  761 fujii                    8171 LBC           0 :         sync_method == SYNC_METHOD_OPEN ||
                               8172               0 :         sync_method == SYNC_METHOD_OPEN_DSYNC)
  761 fujii                    8173 CBC      315883 :         return;
  761 fujii                    8174 ECB             : 
                               8175                 :     /* Measure I/O timing to sync the WAL file */
  761 fujii                    8176 UIC           0 :     if (track_wal_io_timing)
                               8177               0 :         INSTR_TIME_SET_CURRENT(start);
                               8178                 :     else
   79 andres                   8179 UNC           0 :         INSTR_TIME_SET_ZERO(start);
                               8180                 : 
 1742 michael                  8181 UIC           0 :     pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
 8059 tgl                      8182               0 :     switch (sync_method)
                               8183                 :     {
 7836 bruce                    8184               0 :         case SYNC_METHOD_FSYNC:
 4832 heikki.linnakangas       8185               0 :             if (pg_fsync_no_writethrough(fd) != 0)
 1223 michael                  8186 LBC           0 :                 msg = _("could not fsync file \"%s\": %m");
 8059 tgl                      8187 UIC           0 :             break;
 6533 bruce                    8188 ECB             : #ifdef HAVE_FSYNC_WRITETHROUGH
                               8189                 :         case SYNC_METHOD_FSYNC_WRITETHROUGH:
                               8190                 :             if (pg_fsync_writethrough(fd) != 0)
 1223 michael                  8191                 :                 msg = _("could not fsync write-through file \"%s\": %m");
                               8192                 :             break;
                               8193                 : #endif
 8059 tgl                      8194 LBC           0 :         case SYNC_METHOD_FDATASYNC:
 4832 heikki.linnakangas       8195               0 :             if (pg_fdatasync(fd) != 0)
 1223 michael                  8196               0 :                 msg = _("could not fdatasync file \"%s\": %m");
 8059 tgl                      8197 UIC           0 :             break;
 8059 tgl                      8198 LBC           0 :         case SYNC_METHOD_OPEN:
 5445 magnus                   8199 ECB             :         case SYNC_METHOD_OPEN_DSYNC:
  761 fujii                    8200                 :             /* not reachable */
  761 fujii                    8201 UIC           0 :             Assert(false);
 8059 tgl                      8202 ECB             :             break;
 8059 tgl                      8203 LBC           0 :         default:
 7202 tgl                      8204 UIC           0 :             elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
                               8205                 :             break;
 8059 tgl                      8206 ECB             :     }
                               8207                 : 
 1223 michael                  8208 EUB             :     /* PANIC if failed to fsync */
 1223 michael                  8209 UIC           0 :     if (msg)
                               8210                 :     {
                               8211                 :         char        xlogfname[MAXFNAMELEN];
 1223 michael                  8212 UBC           0 :         int         save_errno = errno;
                               8213                 : 
  520 rhaas                    8214 UIC           0 :         XLogFileName(xlogfname, tli, segno, wal_segment_size);
 1223 michael                  8215               0 :         errno = save_errno;
                               8216               0 :         ereport(PANIC,
                               8217                 :                 (errcode_for_file_access(),
                               8218                 :                  errmsg(msg, xlogfname)));
 1223 michael                  8219 EUB             :     }
                               8220                 : 
 1223 michael                  8221 UBC           0 :     pgstat_report_wait_end();
  761 fujii                    8222 EUB             : 
                               8223                 :     /*
                               8224                 :      * Increment the I/O timing and the number of times WAL files were synced.
                               8225                 :      */
  761 fujii                    8226 UIC           0 :     if (track_wal_io_timing)
                               8227                 :     {
  761 fujii                    8228 EUB             :         instr_time  duration;
                               8229                 : 
  761 fujii                    8230 LBC           0 :         INSTR_TIME_SET_CURRENT(duration);
   10 andres                   8231 UNC           0 :         INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, duration, start);
                               8232                 :     }
                               8233                 : 
  368 andres                   8234 UIC           0 :     PendingWalStats.wal_sync++;
                               8235                 : }
 3941 heikki.linnakangas       8236 ECB             : 
                               8237                 : /*
  368 sfrost                   8238                 :  * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
                               8239                 :  * function. It creates the necessary starting checkpoint and constructs the
                               8240                 :  * backup state and tablespace map.
                               8241                 :  *
                               8242                 :  * Input parameters are "state" (the backup state), "fast" (if true, we do
                               8243                 :  * the checkpoint in immediate mode to make it faster), and "tablespaces"
                               8244                 :  * (if non-NULL, indicates a list of tablespaceinfo structs describing the
                               8245                 :  * cluster's tablespaces).
                               8246                 :  *
                               8247                 :  * The tablespace map contents are appended to the passed-in parameter
                               8248                 :  * tblspcmapfile and the caller is responsible for including it in the backup
                               8249                 :  * archive as 'tablespace_map'. The tablespace_map file is required mainly for
                               8250                 :  * tar format in windows as native windows utilities are not able to create
                               8251                 :  * symlinks while extracting files from tar. However for consistency and
                               8252                 :  * platform-independence, we do it the same way everywhere.
                               8253                 :  *
                               8254                 :  * It fills in "state" with the information required for the backup, such
                               8255                 :  * as the minimum WAL location that must be present to restore from this
                               8256                 :  * backup (startpoint) and the corresponding timeline ID (starttli).
                               8257                 :  *
                               8258                 :  * Every successfully started backup must be stopped by calling
                               8259                 :  * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
  363 tgl                      8260 EUB             :  * backups active at the same time.
                               8261                 :  *
                               8262                 :  * It is the responsibility of the caller of this function to verify the
 3379 magnus                   8263                 :  * permissions of the calling user!
                               8264                 :  */
                               8265                 : void
  195 michael                  8266 GNC         130 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
                               8267                 :                    BackupState *state, StringInfo tblspcmapfile)
 4473 magnus                   8268 EUB             : {
                               8269                 :     bool        backup_started_in_recovery;
                               8270                 : 
  195 michael                  8271 GNC         130 :     Assert(state != NULL);
 4092 simon                    8272 GIC         130 :     backup_started_in_recovery = RecoveryInProgress();
                               8273                 : 
                               8274                 :     /*
                               8275                 :      * During recovery, we don't need to check WAL level. Because, if WAL
 3955 bruce                    8276 ECB             :      * level is not sufficient, it's impossible to get here during recovery.
                               8277                 :      */
 4092 simon                    8278 CBC         130 :     if (!backup_started_in_recovery && !XLogIsNeeded())
 5674 tgl                      8279 UIC           0 :         ereport(ERROR,
                               8280                 :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                               8281                 :                  errmsg("WAL level not sufficient for making an online backup"),
                               8282                 :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
                               8283                 : 
 4451 heikki.linnakangas       8284 GIC         130 :     if (strlen(backupidstr) > MAXPGPATH)
                               8285               1 :         ereport(ERROR,
 4451 heikki.linnakangas       8286 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               8287                 :                  errmsg("backup label too long (max %d bytes)",
                               8288                 :                         MAXPGPATH)));
                               8289                 : 
  195 michael                  8290 GNC         129 :     memcpy(state->name, backupidstr, strlen(backupidstr));
                               8291                 : 
                               8292                 :     /*
                               8293                 :      * Mark backup active in shared memory.  We must do full-page WAL writes
                               8294                 :      * during an on-line backup even if not doing so at other times, because
                               8295                 :      * it's quite possible for the backup dump to obtain a "torn" (partially
 6031 bruce                    8296 EUB             :      * written) copy of a database page if it reads the page concurrently with
 3260                          8297                 :      * our write to the same page.  This can be fixed as long as the first
                               8298                 :      * write to the page in the WAL sequence is a full-page write. Hence, we
                               8299                 :      * increment runningBackups then force a CHECKPOINT, to ensure there are
                               8300                 :      * no dirty pages in shared memory that might get dumped while the backup
                               8301                 :      * is in progress without having a corresponding WAL record.  (Once the
                               8302                 :      * backup is complete, we need not force full-page writes anymore, since
                               8303                 :      * we expect that any pages not modified during the backup interval must
                               8304                 :      * have been correctly captured by the backup.)
 6201 tgl                      8305                 :      *
                               8306                 :      * Note that forcing full-page writes has no effect during an online
                               8307                 :      * backup from the standby.
                               8308                 :      *
                               8309                 :      * We must hold all the insertion locks to change the value of
                               8310                 :      * runningBackups, to ensure adequate interlocking against
                               8311                 :      * XLogInsertRecord().
                               8312                 :      */
 3306 heikki.linnakangas       8313 GIC         129 :     WALInsertLockAcquireExclusive();
  368 sfrost                   8314             129 :     XLogCtl->Insert.runningBackups++;
 3306 heikki.linnakangas       8315             129 :     WALInsertLockRelease();
                               8316                 : 
                               8317                 :     /*
                               8318                 :      * Ensure we decrement runningBackups if we fail below. NB -- for this to
                               8319                 :      * work correctly, it is critical that sessionBackupState is only updated
                               8320                 :      * after this block is over.
                               8321                 :      */
  172 alvherre                 8322 GNC         129 :     PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
 6201 tgl                      8323 ECB             :     {
 4382 bruce                    8324 GIC         129 :         bool        gotUniqueStartpoint = false;
 1952 tgl                      8325 ECB             :         DIR        *tblspcdir;
                               8326                 :         struct dirent *de;
                               8327                 :         tablespaceinfo *ti;
 2889 andrew                   8328                 :         int         datadirpathlen;
                               8329                 : 
                               8330                 :         /*
                               8331                 :          * Force an XLOG file switch before the checkpoint, to ensure that the
                               8332                 :          * WAL segment the checkpoint is written to doesn't contain pages with
                               8333                 :          * old timeline IDs.  That would otherwise happen if you called
  368 sfrost                   8334                 :          * pg_backup_start() right after restoring from a PITR archive: the
 4136 tgl                      8335 EUB             :          * first WAL segment containing the startup checkpoint has pages in
 3260 bruce                    8336                 :          * the beginning with the old timeline ID.  That can cause trouble at
 4136 tgl                      8337 ECB             :          * recovery: we won't have a history file covering the old timeline if
                               8338                 :          * pg_wal directory was not included in the base backup and the WAL
                               8339                 :          * archive was cleared too before starting the backup.
 4136 tgl                      8340 EUB             :          *
                               8341                 :          * This also ensures that we have emitted a WAL page header that has
                               8342                 :          * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
                               8343                 :          * Therefore, if a WAL archiver (such as pglesslog) is trying to
                               8344                 :          * compress out removable backup blocks, it won't remove any that
                               8345                 :          * occur after this point.
 4092 simon                    8346                 :          *
                               8347                 :          * During recovery, we skip forcing XLOG file switch, which means that
                               8348                 :          * the backup taken during recovery is not available for the special
                               8349                 :          * recovery case described above.
 4136 tgl                      8350                 :          */
 4092 simon                    8351 GBC         129 :         if (!backup_started_in_recovery)
 2299 andres                   8352 GIC         124 :             RequestXLogSwitch(false);
                               8353                 : 
                               8354                 :         do
                               8355                 :         {
                               8356                 :             bool        checkpointfpw;
                               8357                 : 
 4402 heikki.linnakangas       8358 EUB             :             /*
 3260 bruce                    8359                 :              * Force a CHECKPOINT.  Aside from being necessary to prevent torn
 4382                          8360                 :              * page problems, this guarantees that two successive backup runs
                               8361                 :              * will have different checkpoint positions and hence different
                               8362                 :              * history file names, even if nothing happened in between.
                               8363                 :              *
                               8364                 :              * During recovery, establish a restartpoint if possible. We use
 3955                          8365                 :              * the last restartpoint as the backup starting checkpoint. This
                               8366                 :              * means that two successive backup runs can have same checkpoint
                               8367                 :              * positions.
 4092 simon                    8368                 :              *
                               8369                 :              * Since the fact that we are executing do_pg_backup_start()
                               8370                 :              * during recovery means that checkpointer is running, we can use
                               8371                 :              * RequestCheckpoint() to establish a restartpoint.
                               8372                 :              *
 4382 bruce                    8373                 :              * We use CHECKPOINT_IMMEDIATE only if requested by user (via
                               8374                 :              * passing fast = true).  Otherwise this can take awhile.
                               8375                 :              */
 4402 heikki.linnakangas       8376 GBC         129 :             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
                               8377                 :                               (fast ? CHECKPOINT_IMMEDIATE : 0));
 6822 tgl                      8378 EUB             : 
 4402 heikki.linnakangas       8379                 :             /*
 4382 bruce                    8380                 :              * Now we need to fetch the checkpoint record location, and also
                               8381                 :              * its REDO pointer.  The oldest point in WAL that would be needed
                               8382                 :              * to restore starting from the checkpoint is precisely the REDO
                               8383                 :              * pointer.
                               8384                 :              */
 4402 heikki.linnakangas       8385 GBC         129 :             LWLockAcquire(ControlFileLock, LW_SHARED);
  195 michael                  8386 GNC         129 :             state->checkpointloc = ControlFile->checkPoint;
                               8387             129 :             state->startpoint = ControlFile->checkPointCopy.redo;
                               8388             129 :             state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
 4092 simon                    8389 GIC         129 :             checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
 4402 heikki.linnakangas       8390 GBC         129 :             LWLockRelease(ControlFileLock);
                               8391                 : 
 4092 simon                    8392 GIC         129 :             if (backup_started_in_recovery)
                               8393                 :             {
 3955 bruce                    8394 EUB             :                 XLogRecPtr  recptr;
 4092 simon                    8395                 : 
                               8396                 :                 /*
                               8397                 :                  * Check to see if all WAL replayed during online backup
 3955 bruce                    8398                 :                  * (i.e., since last restartpoint used as backup starting
                               8399                 :                  * checkpoint) contain full-page writes.
                               8400                 :                  */
 3121 andres                   8401 GIC           5 :                 SpinLockAcquire(&XLogCtl->info_lck);
                               8402               5 :                 recptr = XLogCtl->lastFpwDisableRecPtr;
                               8403               5 :                 SpinLockRelease(&XLogCtl->info_lck);
                               8404                 : 
  195 michael                  8405 GNC           5 :                 if (!checkpointfpw || state->startpoint <= recptr)
 4092 simon                    8406 UIC           0 :                     ereport(ERROR,
                               8407                 :                             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                               8408                 :                              errmsg("WAL generated with full_page_writes=off was replayed "
                               8409                 :                                     "since last restartpoint"),
                               8410                 :                              errhint("This means that the backup being taken on the standby "
                               8411                 :                                      "is corrupt and should not be used. "
                               8412                 :                                      "Enable full_page_writes and run CHECKPOINT on the primary, "
                               8413                 :                                      "and then try an online backup again.")));
                               8414                 : 
                               8415                 :                 /*
                               8416                 :                  * During recovery, since we don't use the end-of-backup WAL
                               8417                 :                  * record and don't write the backup history file, the
                               8418                 :                  * starting WAL location doesn't need to be unique. This means
                               8419                 :                  * that two base backups started at the same time might use
                               8420                 :                  * the same checkpoint as starting locations.
                               8421                 :                  */
 4092 simon                    8422 GIC           5 :                 gotUniqueStartpoint = true;
                               8423                 :             }
                               8424                 : 
                               8425                 :             /*
                               8426                 :              * If two base backups are started at the same time (in WAL sender
                               8427                 :              * processes), we need to make sure that they use different
                               8428                 :              * checkpoints as starting locations, because we use the starting
                               8429                 :              * WAL location as a unique identifier for the base backup in the
 4382 bruce                    8430 ECB             :              * end-of-backup WAL record and when we write the backup history
                               8431                 :              * file. Perhaps it would be better to generate a separate unique ID
                               8432                 :              * for each backup instead of forcing another checkpoint, but
                               8433                 :              * taking a checkpoint right after another is not that expensive
                               8434                 :              * either because only few buffers have been dirtied yet.
 4402 heikki.linnakangas       8435                 :              */
 3306 heikki.linnakangas       8436 CBC         129 :             WALInsertLockAcquireExclusive();
  195 michael                  8437 GNC         129 :             if (XLogCtl->Insert.lastBackupStart < state->startpoint)
                               8438                 :             {
                               8439             129 :                 XLogCtl->Insert.lastBackupStart = state->startpoint;
 4402 heikki.linnakangas       8440 GIC         129 :                 gotUniqueStartpoint = true;
                               8441                 :             }
 3306 heikki.linnakangas       8442 CBC         129 :             WALInsertLockRelease();
 4382 bruce                    8443 GBC         129 :         } while (!gotUniqueStartpoint);
                               8444                 : 
 2889 andrew                   8445 ECB             :         /*
  363 tgl                      8446                 :          * Construct tablespace_map file.
                               8447                 :          */
 2889 andrew                   8448 GIC         129 :         datadirpathlen = strlen(DataDir);
                               8449                 : 
                               8450                 :         /* Collect information about all tablespaces */
 1952 tgl                      8451 CBC         129 :         tblspcdir = AllocateDir("pg_tblspc");
 2889 andrew                   8452 GIC         414 :         while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
                               8453                 :         {
                               8454                 :             char        fullpath[MAXPGPATH + 10];
                               8455                 :             char        linkpath[MAXPGPATH];
                               8456             285 :             char       *relpath = NULL;
                               8457                 :             int         rllen;
                               8458                 :             StringInfoData escapedpath;
                               8459                 :             char       *s;
                               8460                 : 
                               8461                 :             /* Skip anything that doesn't look like a tablespace */
  753 tgl                      8462             285 :             if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
 2889 andrew                   8463             269 :                 continue;
                               8464                 : 
                               8465              27 :             snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
                               8466                 : 
                               8467                 :             /*
                               8468                 :              * Skip anything that isn't a symlink/junction.  For testing only,
                               8469                 :              * we sometimes use allow_in_place_tablespaces to create
                               8470                 :              * directories directly under pg_tblspc, which would fail below.
                               8471                 :              */
  390 tmunro                   8472              27 :             if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK)
                               8473              11 :                 continue;
  390 tmunro                   8474 ECB             : 
 2889 andrew                   8475 CBC          16 :             rllen = readlink(fullpath, linkpath, sizeof(linkpath));
 2889 andrew                   8476 GIC          16 :             if (rllen < 0)
                               8477                 :             {
 2889 andrew                   8478 UIC           0 :                 ereport(WARNING,
                               8479                 :                         (errmsg("could not read symbolic link \"%s\": %m",
                               8480                 :                                 fullpath)));
                               8481               0 :                 continue;
 2889 andrew                   8482 ECB             :             }
 2889 andrew                   8483 GIC          16 :             else if (rllen >= sizeof(linkpath))
 2889 andrew                   8484 ECB             :             {
 2889 andrew                   8485 UIC           0 :                 ereport(WARNING,
                               8486                 :                         (errmsg("symbolic link \"%s\" target is too long",
                               8487                 :                                 fullpath)));
                               8488               0 :                 continue;
                               8489                 :             }
 2889 andrew                   8490 GIC          16 :             linkpath[rllen] = '\0';
                               8491                 : 
                               8492                 :             /*
                               8493                 :              * Build a backslash-escaped version of the link path to include
                               8494                 :              * in the tablespace map file.
                               8495                 :              */
  753 tgl                      8496              16 :             initStringInfo(&escapedpath);
                               8497             444 :             for (s = linkpath; *s; s++)
                               8498                 :             {
                               8499             428 :                 if (*s == '\n' || *s == '\r' || *s == '\\')
  753 tgl                      8500 UIC           0 :                     appendStringInfoChar(&escapedpath, '\\');
  753 tgl                      8501 GIC         428 :                 appendStringInfoChar(&escapedpath, *s);
                               8502                 :             }
                               8503                 : 
                               8504                 :             /*
                               8505                 :              * Relpath holds the relative path of the tablespace directory
                               8506                 :              * when it's located within PGDATA, or NULL if it's located
                               8507                 :              * elsewhere.
                               8508                 :              */
 2889 andrew                   8509              16 :             if (rllen > datadirpathlen &&
                               8510               1 :                 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
 2889 andrew                   8511 LBC           0 :                 IS_DIR_SEP(linkpath[datadirpathlen]))
                               8512               0 :                 relpath = linkpath + datadirpathlen + 1;
                               8513                 : 
 2889 andrew                   8514 GIC          16 :             ti = palloc(sizeof(tablespaceinfo));
                               8515              16 :             ti->oid = pstrdup(de->d_name);
  753 tgl                      8516              16 :             ti->path = pstrdup(linkpath);
 2889 andrew                   8517              16 :             ti->rpath = relpath ? pstrdup(relpath) : NULL;
 1026 rhaas                    8518              16 :             ti->size = -1;
                               8519                 : 
 2878 bruce                    8520              16 :             if (tablespaces)
                               8521              16 :                 *tablespaces = lappend(*tablespaces, ti);
                               8522                 : 
  753 tgl                      8523              16 :             appendStringInfo(tblspcmapfile, "%s %s\n",
                               8524                 :                              ti->oid, escapedpath.data);
                               8525                 : 
                               8526              16 :             pfree(escapedpath.data);
                               8527                 :         }
 1952                          8528             129 :         FreeDir(tblspcdir);
                               8529                 : 
  195 michael                  8530 GNC         129 :         state->starttime = (pg_time_t) time(NULL);
                               8531                 :     }
  172 alvherre                 8532             129 :     PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
                               8533                 : 
  195 michael                  8534             129 :     state->started_in_recovery = backup_started_in_recovery;
  195 michael                  8535 ECB             : 
 2273 fujii                    8536                 :     /*
                               8537                 :      * Mark that the start phase has correctly finished for the backup.
                               8538                 :      */
  368 sfrost                   8539 GBC         129 :     sessionBackupState = SESSION_BACKUP_RUNNING;
 6823 tgl                      8540 GIC         129 : }
                               8541                 : 
                               8542                 : /*
                               8543                 :  * Utility routine to fetch the session-level status of a running backup.
                                                    :  *
                                                    :  * Returns the current value of sessionBackupState as-is; no lock is taken
                                                    :  * and nothing is modified.  In this file that variable is set to
                                                    :  * SESSION_BACKUP_RUNNING once a backup's start phase has finished and is
                                                    :  * reset to SESSION_BACKUP_NONE by do_pg_backup_stop().
                               8544                 :  */
 2207 teodor                   8545 ECB             : SessionBackupState
 2207 teodor                   8546 CBC         149 : get_backup_status(void)
                               8547                 : {
                               8548             149 :     return sessionBackupState;
 2207 teodor                   8549 ECB             : }
                               8550                 : 
 4451 heikki.linnakangas       8551                 : /*
  368 sfrost                   8552                 :  * do_pg_backup_stop
                               8553                 :  *
                               8554                 :  * Utility function called at the end of an online backup.  It creates the
                               8555                 :  * backup history file (if required), resets sessionBackupState and so on.
                               8556                 :  * It can optionally wait for WAL segments to be archived.
                               8557                 :  *
                               8558                 :  * "state" is filled with the information necessary to restore from this
                               8559                 :  * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
                                                    :  * Its startpoint and started_in_recovery fields are read here, so they are
                                                    :  * expected to have been filled in beforehand (presumably when the backup
                                                    :  * was started).  stoptime is only set when the backup is stopped outside
                                                    :  * of recovery.
                                                    :  *
                                                    :  * If "waitforarchive" is true and archiving applies (XLogArchivingActive()
                                                    :  * when not in recovery, XLogArchivingAlways() when in recovery), loop until
                                                    :  * XLogArchiveIsBusy() no longer reports the last WAL file of the backup or
                                                    :  * the backup history file as busy.  If "waitforarchive" is true but
                                                    :  * archiving does not apply, only a NOTICE is emitted.
                                                    :  *
                                                    :  * An ERROR is raised if the WAL level is insufficient, if a backup started
                                                    :  * in recovery is stopped after recovery has ended, if WAL generated with
                                                    :  * full_page_writes=off was replayed during the backup, or if the backup
                                                    :  * history file cannot be created or written.
                               8560                 :  *
 3379 magnus                   8561                 :  * It is the responsibility of the caller of this function to verify the
                               8562                 :  * permissions of the calling user!
                               8563                 :  */
                               8564                 : void
  195 michael                  8565 GNC         122 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
 6823 tgl                      8566 ECB             : {
  195 michael                  8567 GNC         122 :     bool        backup_stopped_in_recovery = false;
 6822 tgl                      8568 ECB             :     char        histfilepath[MAXPGPATH];
                               8569                 :     char        lastxlogfilename[MAXFNAMELEN];
                               8570                 :     char        histfilename[MAXFNAMELEN];
                               8571                 :     XLogSegNo   _logSegNo;
                               8572                 :     FILE       *fp;
 5482 bruce                    8573                 :     int         seconds_before_warning;
 5482 bruce                    8574 CBC         122 :     int         waits = 0;
 4739 simon                    8575 GIC         122 :     bool        reported_waiting = false;
                               8576                 : 
  195 michael                  8577 GNC         122 :     Assert(state != NULL);
                               8578                 : 
                               8579             122 :     backup_stopped_in_recovery = RecoveryInProgress();
 6823 tgl                      8580 EUB             : 
                               8581                 :     /*
 3955 bruce                    8582 ECB             :      * During recovery, we don't need to check WAL level, because if WAL
                               8583                 :      * level is not sufficient, it's impossible to get here during recovery.
 4092 simon                    8584 EUB             :      */
  195 michael                  8585 GNC         122 :     if (!backup_stopped_in_recovery && !XLogIsNeeded())
 5326 tgl                      8586 UIC           0 :         ereport(ERROR,
 5326 tgl                      8587 EUB             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                               8588                 :                  errmsg("WAL level not sufficient for making an online backup"),
 2596 peter_e                  8589 ECB             :                  errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
                               8590                 : 
                               8591                 :     /*
                               8592                 :      * OK to update backup counter and session-level lock.
                               8593                 :      *
                               8594                 :      * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
                               8595                 :      * otherwise they can be updated inconsistently, which might cause
 1937 fujii                    8596                 :      * do_pg_abort_backup() to fail.
                               8597                 :      */
 2273 fujii                    8598 CBC         122 :     WALInsertLockAcquireExclusive();
 4451 heikki.linnakangas       8599 EUB             : 
  368 sfrost                   8600 ECB             :     /*
                               8601                 :      * It is expected that each do_pg_backup_start() call is matched by
                               8602                 :      * exactly one do_pg_backup_stop() call.
                               8603                 :      */
  368 sfrost                   8604 GIC         122 :     Assert(XLogCtl->Insert.runningBackups > 0);
                               8605             122 :     XLogCtl->Insert.runningBackups--;
                               8606                 : 
                               8607                 :     /*
 1937 fujii                    8608 ECB             :      * Clean up session-level lock.
                               8609                 :      *
 1809 tgl                      8610                 :      * You might think that WALInsertLockRelease() can be called before
                               8611                 :      * cleaning up session-level lock because session-level lock doesn't need
                               8612                 :      * to be protected with WAL insertion lock. But since
                               8613                 :      * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
                               8614                 :      * cleaned up before it.
 1937 fujii                    8615                 :      */
 2207 teodor                   8616 GIC         122 :     sessionBackupState = SESSION_BACKUP_NONE;
 2207 teodor                   8617 ECB             : 
 1937 fujii                    8618 GIC         122 :     WALInsertLockRelease();
                               8619                 : 
 6823 tgl                      8620 ECB             :     /*
                               8621                 :      * If we are taking an online backup from the standby, we confirm that the
                               8622                 :      * standby has not been promoted during the backup.
                               8623                 :      */
  195 michael                  8624 GNC         122 :     if (state->started_in_recovery && !backup_stopped_in_recovery)
 4092 simon                    8625 UIC           0 :         ereport(ERROR,
                               8626                 :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                               8627                 :                  errmsg("the standby was promoted during online backup"),
                               8628                 :                  errhint("This means that the backup being taken is corrupt "
                               8629                 :                          "and should not be used. "
                               8630                 :                          "Try taking another online backup.")));
                               8631                 : 
                               8632                 :     /*
                               8633                 :      * During recovery, we don't write an end-of-backup record. We assume that
                               8634                 :      * pg_control was backed up last and its minimum recovery point can be
                               8635                 :      * available as the backup end location. Since we don't have an
                               8636                 :      * end-of-backup record, we use the pg_control value to check whether
                               8637                 :      * we've reached the end of backup when starting recovery from this
                               8638                 :      * backup. We have no way of checking if pg_control wasn't backed up last
                               8639                 :      * however.
 4092 simon                    8640 ECB             :      *
                               8641                 :      * We don't force a switch to new WAL file but it is still possible to
 2073 rhaas                    8642                 :      * wait for all the required files to be archived if waitforarchive is
                               8643                 :      * true. This is okay if we use the backup to start a standby and fetch
                               8644                 :      * the missing WAL using streaming replication. But in the case of an
                               8645                 :      * archive recovery, a user should set waitforarchive to true and wait for
                               8646                 :      * them to be archived to ensure that all the required files are
                               8647                 :      * available.
                               8648                 :      *
 4092 simon                    8649                 :      * We return the current minimum recovery point as the backup end
 3839 heikki.linnakangas       8650                 :      * location. Note that it can be greater than the exact backup end
                               8651                 :      * location if the minimum recovery point is updated after the backup of
 3955 bruce                    8652                 :      * pg_control. This is harmless for current uses.
                               8653                 :      *
 4092 simon                    8654                 :      * XXX currently a backup history file is for informational and debug
                               8655                 :      * purposes only. It's not essential for an online backup. Furthermore,
                               8656                 :      * even if it's created, it will not be archived during recovery because
                               8657                 :      * an archiver is not invoked. So it doesn't seem worthwhile to write a
                               8658                 :      * backup history file during recovery.
                               8659                 :      */
  195 michael                  8660 GNC         122 :     if (backup_stopped_in_recovery)
 4092 simon                    8661 EUB             :     {
                               8662                 :         XLogRecPtr  recptr;
                               8663                 : 
                               8664                 :         /*
                               8665                 :          * Check to see if all WAL replayed during online backup contain
                               8666                 :          * full-page writes.
                               8667                 :          */
 3121 andres                   8668 GIC           5 :         SpinLockAcquire(&XLogCtl->info_lck);
                               8669               5 :         recptr = XLogCtl->lastFpwDisableRecPtr;
                               8670               5 :         SpinLockRelease(&XLogCtl->info_lck);
                               8671                 : 
  195 michael                  8672 GNC           5 :         if (state->startpoint <= recptr)
 4092 simon                    8673 LBC           0 :             ereport(ERROR,
                               8674                 :                     (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                               8675                 :                      errmsg("WAL generated with full_page_writes=off was replayed "
                               8676                 :                             "during online backup"),
                               8677                 :                      errhint("This means that the backup being taken on the standby "
                               8678                 :                              "is corrupt and should not be used. "
 1029 andres                   8679 ECB             :                              "Enable full_page_writes and run CHECKPOINT on the primary, "
 2118 tgl                      8680                 :                              "and then try an online backup again.")));
                               8681                 : 
                               8682                 : 
 4092 simon                    8683 GIC           5 :         LWLockAcquire(ControlFileLock, LW_SHARED);
  195 michael                  8684 GNC           5 :         state->stoppoint = ControlFile->minRecoveryPoint;
                               8685               5 :         state->stoptli = ControlFile->minRecoveryPointTLI;
 4092 simon                    8686 GIC           5 :         LWLockRelease(ControlFileLock);
                               8687                 :     }
                               8688                 :     else
                               8689                 :     {
                               8690                 :         char       *history_file;
                               8691                 : 
                               8692                 :         /*
 2073 rhaas                    8693 ECB             :          * Write the backup-end xlog record
                               8694                 :          */
 2073 rhaas                    8695 CBC         117 :         XLogBeginInsert();
  195 michael                  8696 GNC         117 :         XLogRegisterData((char *) (&state->startpoint),
                               8697                 :                          sizeof(state->startpoint));
                               8698             117 :         state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
                               8699                 : 
                               8700                 :         /*
                               8701                 :          * Given that we're not in recovery, InsertTimeLineID is set and can't
  520 rhaas                    8702 ECB             :          * change, so we can read it without a lock.
  520 rhaas                    8703 EUB             :          */
  195 michael                  8704 GNC         117 :         state->stoptli = XLogCtl->InsertTimeLineID;
                               8705                 : 
                               8706                 :         /*
                               8707                 :          * Force a switch to a new xlog segment file, so that the backup is
                               8708                 :          * valid as soon as archiver moves out the current segment file.
                               8709                 :          */
 2073 rhaas                    8710 GIC         117 :         RequestXLogSwitch(false);
                               8711                 : 
  195 michael                  8712 GNC         117 :         state->stoptime = (pg_time_t) time(NULL);
                               8713                 : 
                               8714                 :         /*
                               8715                 :          * Write the backup history file
                               8716                 :          */
                               8717             117 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
                               8718             117 :         BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
                               8719                 :                               state->startpoint, wal_segment_size);
 2073 rhaas                    8720 GIC         117 :         fp = AllocateFile(histfilepath, "w");
                               8721             117 :         if (!fp)
 2073 rhaas                    8722 UIC           0 :             ereport(ERROR,
                               8723                 :                     (errcode_for_file_access(),
                               8724                 :                      errmsg("could not create file \"%s\": %m",
                               8725                 :                             histfilepath)));
                               8726                 : 
                               8727                 :         /* Build and save the contents of the backup history file */
  195 michael                  8728 GNC         117 :         history_file = build_backup_content(state, true);
  194                          8729             117 :         fprintf(fp, "%s", history_file);
  195                          8730             117 :         pfree(history_file);
                               8731                 : 
 2073 rhaas                    8732 GIC         117 :         if (fflush(fp) || ferror(fp) || FreeFile(fp))
 2073 rhaas                    8733 LBC           0 :             ereport(ERROR,
 2073 rhaas                    8734 ECB             :                     (errcode_for_file_access(),
                               8735                 :                      errmsg("could not write file \"%s\": %m",
                               8736                 :                             histfilepath)));
 6797 bruce                    8737                 : 
 2073 rhaas                    8738 EUB             :         /*
                               8739                 :          * Clean out any no-longer-needed history files.  As a side effect,
                               8740                 :          * this will post a .ready file for the newly created history file,
                               8741                 :          * notifying the archiver that history file may be archived
                               8742                 :          * immediately.
                               8743                 :          */
 2073 rhaas                    8744 GIC         117 :         CleanupBackupHistory();
                               8745                 :     }
                               8746                 : 
                               8747                 :     /*
 4728 tgl                      8748 ECB             :      * If archiving is enabled, wait for all the required WAL files to be
 4660 bruce                    8749                 :      * archived before returning. If archiving isn't enabled, the required WAL
                               8750                 :      * needs to be transported via streaming replication (hopefully with
  993 fujii                    8751                 :      * wal_keep_size set high enough), or some more exotic mechanism like
                               8752                 :      * polling and copying files from pg_wal with script. We have no knowledge
                               8753                 :      * of those mechanisms, so it's up to the user to ensure that they get all
                               8754                 :      * the required WAL.
                               8755                 :      *
                               8756                 :      * We wait until both the last WAL file filled during backup and the
                               8757                 :      * history file have been archived, and assume that the alphabetic sorting
                               8758                 :      * property of the WAL files ensures any earlier WAL files are safely
                               8759                 :      * archived as well.
 5482 bruce                    8760                 :      *
 5050                          8761                 :      * We wait forever, since archive_command is supposed to work and we
                               8762                 :      * assume the admin wanted their backup to work completely. If you don't
 2209 sfrost                   8763                 :      * wish to wait, then either waitforarchive should be passed in as false,
                               8764                 :      * or you can set statement_timeout.  Also, some notices are issued to
                               8765                 :      * clue in anyone who might be doing this interactively.
                               8766                 :      */
                               8767                 : 
 2073 rhaas                    8768 GIC         122 :     if (waitforarchive &&
  195 michael                  8769 GNC           8 :         ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
                               8770               1 :          (backup_stopped_in_recovery && XLogArchivingAlways())))
                               8771                 :     {
                               8772               2 :         XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
                               8773               2 :         XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
                               8774                 :                      wal_segment_size);
                               8775                 : 
                               8776               2 :         XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
                               8777               2 :         BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
                               8778                 :                               state->startpoint, wal_segment_size);
                               8779                 : 
 4660 bruce                    8780 GIC           2 :         seconds_before_warning = 60;
                               8781               2 :         waits = 0;
                               8782                 : 
 4660 bruce                    8783 CBC           6 :         while (XLogArchiveIsBusy(lastxlogfilename) ||
                               8784               2 :                XLogArchiveIsBusy(histfilename))
                               8785                 :         {
                               8786               2 :             CHECK_FOR_INTERRUPTS();
 4739 simon                    8787 ECB             : 
 4660 bruce                    8788 GBC           2 :             if (!reported_waiting && waits > 5) /* one-time NOTICE once more than 5 waits have passed */
                               8789                 :             {
 4660 bruce                    8790 UIC           0 :                 ereport(NOTICE,
                               8791                 :                         (errmsg("base backup done, waiting for required WAL segments to be archived")));
                               8792               0 :                 reported_waiting = true;
                               8793                 :             }
 5482 bruce                    8794 ECB             : 
  642 michael                  8795 CBC           2 :             (void) WaitLatch(MyLatch,
  642 michael                  8796 ECB             :                              WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                               8797                 :                              1000L, /* timeout per wait; each loop iteration is counted in waits, which the WARNING below reports as seconds */
                               8798                 :                              WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
  642 michael                  8799 GBC           2 :             ResetLatch(MyLatch);
                               8800                 : 
 4660 bruce                    8801 GIC           2 :             if (++waits >= seconds_before_warning)
                               8802                 :             {
 4660 bruce                    8803 UIC           0 :                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
                               8804               0 :                 ereport(WARNING,
                               8805                 :                         (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
                               8806                 :                                 waits),
                               8807                 :                          errhint("Check that your archive_command is executing properly.  "
                               8808                 :                                  "You can safely cancel this backup, "
                               8809                 :                                  "but the database backup will not be usable without all the WAL segments.")));
 4660 bruce                    8810 ECB             :             }
                               8811                 :         }
                               8812                 : 
 4660 bruce                    8813 GIC           2 :         ereport(NOTICE,
                               8814                 :                 (errmsg("all required WAL segments have been archived")));
                               8815                 :     }
 4442 magnus                   8816             120 :     else if (waitforarchive)
 4728 tgl                      8817               6 :         ereport(NOTICE,
                               8818                 :                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
 4473 magnus                   8819             122 : }
                               8820                 : 
                               8821                 : 
                               8822                 : /*
                               8823                 :  * do_pg_abort_backup: abort a running backup
                               8824                 :  *
                               8825                 :  * This does just the most basic steps of do_pg_backup_stop(), by taking the
                               8826                 :  * system out of backup mode, thus making it a lot more safe to call from
 4473 magnus                   8827 ECB             :  * an error handler.
 4451 heikki.linnakangas       8828                 :  *
                               8829                 :  * 'arg' indicates that it's being called during backup setup; so
                               8830                 :  * sessionBackupState has not been modified yet, but runningBackups has
                               8831                 :  * already been incremented.  When it's false, then it's invoked as a
                               8832                 :  * before_shmem_exit handler, and therefore we must not change state
                               8833                 :  * unless sessionBackupState indicates that a backup is actually running.
 1207 rhaas                    8834                 :  *
                               8835                 :  * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
                               8836                 :  * before_shmem_exit handler, hence the odd-looking signature.
                               8837                 :  */
 4473 magnus                   8838                 : void
 1207 rhaas                    8839 CBC           9 : do_pg_abort_backup(int code, Datum arg)
                               8840                 : {
  172 alvherre                 8841 GNC           9 :     bool        during_backup_start = DatumGetBool(arg);
 1207 rhaas                    8842 ECB             : 
                               8843                 :     /* If called during backup start, there shouldn't be one already running */
  167 alvherre                 8844 GNC           9 :     Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
 1937 fujii                    8845 ECB             : 
  172 alvherre                 8846 GNC           9 :     if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
 4451 heikki.linnakangas       8847 EUB             :     {
  172 alvherre                 8848 GNC           7 :         WALInsertLockAcquireExclusive();
                               8849               7 :         Assert(XLogCtl->Insert.runningBackups > 0);
                               8850               7 :         XLogCtl->Insert.runningBackups--;
                               8851                 : 
                               8852               7 :         sessionBackupState = SESSION_BACKUP_NONE;
                               8853               7 :         WALInsertLockRelease();
                               8854                 : 
                               8855               7 :         if (!during_backup_start)
                               8856               7 :             ereport(WARNING,
                               8857                 :                     errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
                               8858                 :     }
 1207 rhaas                    8859 GIC           9 : }
 1207 rhaas                    8860 EUB             : 
                               8861                 : /*
                               8862                 :  * Register a handler that will warn about unterminated backups at end of
                               8863                 :  * session, unless this has already been done.
                               8864                 :  */
                               8865                 : void
 1207 rhaas                    8866 GIC           4 : register_persistent_abort_backup_handler(void)
                               8867                 : {
                               8868                 :     static bool already_done = false;
                               8869                 : 
 1207 rhaas                    8870 CBC           4 :     if (already_done)
 1207 rhaas                    8871 GIC           1 :         return;
  172 alvherre                 8872 GNC           3 :     before_shmem_exit(do_pg_abort_backup, DatumGetBool(false));
 1207 rhaas                    8873 CBC           3 :     already_done = true;
 6823 tgl                      8874 ECB             : }
                               8875                 : 
 4832 heikki.linnakangas       8876                 : /*
                               8877                 :  * Get latest WAL insert pointer
                               8878                 :  */
                               8879                 : XLogRecPtr
 4106 heikki.linnakangas       8880 GIC        4968 : GetXLogInsertRecPtr(void)
                               8881                 : {
 3121 andres                   8882            4968 :     XLogCtlInsert *Insert = &XLogCtl->Insert;
                               8883                 :     uint64      current_bytepos;
                               8884                 : 
 3562 heikki.linnakangas       8885            4968 :     SpinLockAcquire(&Insert->insertpos_lck);
                               8886            4968 :     current_bytepos = Insert->CurrBytePos;
                               8887            4968 :     SpinLockRelease(&Insert->insertpos_lck);
                               8888                 : 
                               8889            4968 :     return XLogBytePosToRecPtr(current_bytepos);
                               8890                 : }
                               8891                 : 
                               8892                 : /*
                               8893                 :  * Get latest WAL write pointer
                               8894                 :  */
                               8895                 : XLogRecPtr
  417 heikki.linnakangas       8896 CBC        1029 : GetXLogWriteRecPtr(void)
                               8897                 : {
                               8898            1029 :     SpinLockAcquire(&XLogCtl->info_lck);
  417 heikki.linnakangas       8899 GIC        1029 :     LogwrtResult = XLogCtl->LogwrtResult;
                               8900            1029 :     SpinLockRelease(&XLogCtl->info_lck);
 2889 andrew                   8901 ECB             : 
  417 heikki.linnakangas       8902 GIC        1029 :     return LogwrtResult.Write;
 2889 andrew                   8903 ECB             : }
                               8904                 : 
 6225 tgl                      8905                 : /*
  417 heikki.linnakangas       8906                 :  * Returns the redo pointer of the last checkpoint or restartpoint. This is
                               8907                 :  * the oldest point in WAL that we still need, if we have to restart recovery.
                               8908                 :  */
                               8909                 : void
  417 heikki.linnakangas       8910 CBC          53 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
                               8911                 : {
                               8912              53 :     LWLockAcquire(ControlFileLock, LW_SHARED);
                               8913              53 :     *oldrecptr = ControlFile->checkPointCopy.redo;
  417 heikki.linnakangas       8914 GIC          53 :     *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
                               8915              53 :     LWLockRelease(ControlFileLock);
 6225 tgl                      8916 CBC          53 : }
                               8917                 : 
                               8918                 : /* Thin wrapper around ShutdownWalRcv(). */
                               8919                 : void
  650 noah                     8920 GIC        1283 : XLogShutdownWalRcv(void)
                               8921                 : {
                               8922            1283 :     ShutdownWalRcv();
  650 noah                     8923 ECB             : 
  650 noah                     8924 GIC        1283 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                               8925            1283 :     XLogCtl->InstallXLogFileSegmentActive = false;
                               8926            1283 :     LWLockRelease(ControlFileLock);
  650 noah                     8927 CBC        1283 : }
  650 noah                     8928 ECB             : 
  417 heikki.linnakangas       8929                 : /* Enable WAL file recycling and preallocation. */
 2769 fujii                    8930                 : void
  417 heikki.linnakangas       8931 GIC        1572 : SetInstallXLogFileSegmentActive(void)
                               8932                 : {
                               8933            1572 :     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                               8934            1572 :     XLogCtl->InstallXLogFileSegmentActive = true;
                               8935            1572 :     LWLockRelease(ControlFileLock);
 2769 fujii                    8936            1572 : }
 2769 fujii                    8937 ECB             : 
                               8938                 : bool
  417 heikki.linnakangas       8939 CBC          24 : IsInstallXLogFileSegmentActive(void)
                               8940                 : {
                               8941                 :     bool        result;
 3722 simon                    8942 ECB             : 
  417 heikki.linnakangas       8943 CBC          24 :     LWLockAcquire(ControlFileLock, LW_SHARED);
                               8944              24 :     result = XLogCtl->InstallXLogFileSegmentActive;
  417 heikki.linnakangas       8945 GIC          24 :     LWLockRelease(ControlFileLock);
 4436 rhaas                    8946 ECB             : 
  417 heikki.linnakangas       8947 GIC          24 :     return result;
                               8948                 : }
                               8949                 : 
                               8950                 : /*
                               8951                 :  * Update the WalWriterSleeping flag.
                               8952                 :  */
 3988 tgl                      8953 ECB             : void
 3988 tgl                      8954 GIC         366 : SetWalWriterSleeping(bool sleeping)
 3988 tgl                      8955 ECB             : {
 3121 andres                   8956 CBC         366 :     SpinLockAcquire(&XLogCtl->info_lck);
                               8957             366 :     XLogCtl->WalWriterSleeping = sleeping;
 3121 andres                   8958 GIC         366 :     SpinLockRelease(&XLogCtl->info_lck);
 3988 tgl                      8959 CBC         366 : }

Generated by: LCOV version v1.16-55-g56c0a2a