Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlog.c
4 : * PostgreSQL write-ahead log manager
5 : *
6 : * The Write-Ahead Log (WAL) functionality is split into several source
7 : * files, in addition to this one:
8 : *
9 : * xloginsert.c - Functions for constructing WAL records
10 : * xlogrecovery.c - WAL recovery and standby code
11 : * xlogreader.c - Facility for reading WAL files and parsing WAL records
12 : * xlogutils.c - Helper functions for WAL redo routines
13 : *
14 : * This file contains functions for coordinating database startup and
15 : * checkpointing, and managing the write-ahead log buffers when the
16 : * system is running.
17 : *
18 : * StartupXLOG() is the main entry point of the startup process. It
19 : * coordinates database startup, performing WAL recovery, and the
20 : * transition from WAL recovery into normal operations.
21 : *
22 : * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most
23 : * callers should not call this directly, but use the functions in
24 : * xloginsert.c to construct the WAL record. XLogFlush() can be used
25 : * to force the WAL to disk.
26 : *
27 : * In addition to those, there are many other functions for interrogating
28 : * the current system state, and for starting/stopping backups.
29 : *
30 : *
31 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : * src/backend/access/transam/xlog.c
35 : *
36 : *-------------------------------------------------------------------------
37 : */
38 :
39 : #include "postgres.h"
40 :
41 : #include <ctype.h>
42 : #include <math.h>
43 : #include <time.h>
44 : #include <fcntl.h>
45 : #include <sys/stat.h>
46 : #include <sys/time.h>
47 : #include <unistd.h>
48 :
49 : #include "access/clog.h"
50 : #include "access/commit_ts.h"
51 : #include "access/heaptoast.h"
52 : #include "access/multixact.h"
53 : #include "access/rewriteheap.h"
54 : #include "access/subtrans.h"
55 : #include "access/timeline.h"
56 : #include "access/transam.h"
57 : #include "access/twophase.h"
58 : #include "access/xact.h"
59 : #include "access/xlog_internal.h"
60 : #include "access/xlogarchive.h"
61 : #include "access/xloginsert.h"
62 : #include "access/xlogprefetcher.h"
63 : #include "access/xlogreader.h"
64 : #include "access/xlogrecovery.h"
65 : #include "access/xlogutils.h"
66 : #include "backup/basebackup.h"
67 : #include "catalog/catversion.h"
68 : #include "catalog/pg_control.h"
69 : #include "catalog/pg_database.h"
70 : #include "common/controldata_utils.h"
71 : #include "common/file_utils.h"
72 : #include "executor/instrument.h"
73 : #include "miscadmin.h"
74 : #include "pg_trace.h"
75 : #include "pgstat.h"
76 : #include "port/atomics.h"
77 : #include "port/pg_iovec.h"
78 : #include "postmaster/bgwriter.h"
79 : #include "postmaster/startup.h"
80 : #include "postmaster/walwriter.h"
81 : #include "replication/logical.h"
82 : #include "replication/origin.h"
83 : #include "replication/slot.h"
84 : #include "replication/snapbuild.h"
85 : #include "replication/walreceiver.h"
86 : #include "replication/walsender.h"
87 : #include "storage/bufmgr.h"
88 : #include "storage/fd.h"
89 : #include "storage/ipc.h"
90 : #include "storage/large_object.h"
91 : #include "storage/latch.h"
92 : #include "storage/pmsignal.h"
93 : #include "storage/predicate.h"
94 : #include "storage/proc.h"
95 : #include "storage/procarray.h"
96 : #include "storage/reinit.h"
97 : #include "storage/smgr.h"
98 : #include "storage/spin.h"
99 : #include "storage/sync.h"
100 : #include "utils/guc_hooks.h"
101 : #include "utils/guc_tables.h"
102 : #include "utils/memutils.h"
103 : #include "utils/ps_status.h"
104 : #include "utils/relmapper.h"
105 : #include "utils/pg_rusage.h"
106 : #include "utils/snapmgr.h"
107 : #include "utils/timeout.h"
108 : #include "utils/timestamp.h"
109 : #include "utils/varlena.h"
110 :
111 : extern uint32 bootstrap_data_checksum_version;
112 :
113 : /* timeline ID to be used when bootstrapping */
114 : #define BootstrapTimeLineID 1
115 :
116 : /* User-settable parameters */
117 : int max_wal_size_mb = 1024; /* 1 GB */
118 : int min_wal_size_mb = 80; /* 80 MB */
119 : int wal_keep_size_mb = 0;
120 : int XLOGbuffers = -1;
121 : int XLogArchiveTimeout = 0;
122 : int XLogArchiveMode = ARCHIVE_MODE_OFF;
123 : char *XLogArchiveCommand = NULL;
124 : bool EnableHotStandby = false;
125 : bool fullPageWrites = true;
126 : bool wal_log_hints = false;
127 : int wal_compression = WAL_COMPRESSION_NONE;
128 : char *wal_consistency_checking_string = NULL;
129 : bool *wal_consistency_checking = NULL;
130 : bool wal_init_zero = true;
131 : bool wal_recycle = true;
132 : bool log_checkpoints = true;
133 : int sync_method = DEFAULT_SYNC_METHOD;
134 : int wal_level = WAL_LEVEL_REPLICA;
135 : int CommitDelay = 0; /* precommit delay in microseconds */
136 : int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
137 : int wal_retrieve_retry_interval = 5000;
138 : int max_slot_wal_keep_size_mb = -1;
139 : int wal_decode_buffer_size = 512 * 1024;
140 : bool track_wal_io_timing = false;
141 :
142 : #ifdef WAL_DEBUG
143 : bool XLOG_DEBUG = false;
144 : #endif
145 :
146 : int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
147 :
148 : /*
149 : * Number of WAL insertion locks to use. A higher value allows more insertions
150 : * to happen concurrently, but adds some CPU overhead to flushing the WAL,
151 : * which needs to iterate all the locks.
152 : */
153 : #define NUM_XLOGINSERT_LOCKS 8
154 :
155 : /*
156 : * Max distance from last checkpoint, before triggering a new xlog-based
157 : * checkpoint.
158 : */
159 : int CheckPointSegments;
160 :
161 : /* Estimated distance between checkpoints, in bytes */
162 : static double CheckPointDistanceEstimate = 0;
163 : static double PrevCheckPointDistance = 0;
164 :
165 : /*
166 : * Track whether there were any deferred checks for custom resource managers
167 : * specified in wal_consistency_checking.
168 : */
169 : static bool check_wal_consistency_checking_deferred = false;
170 :
171 : /*
172 : * GUC support
173 : */
174 : const struct config_enum_entry sync_method_options[] = {
175 : {"fsync", SYNC_METHOD_FSYNC, false},
176 : #ifdef HAVE_FSYNC_WRITETHROUGH
177 : {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
178 : #endif
179 : {"fdatasync", SYNC_METHOD_FDATASYNC, false},
180 : #ifdef O_SYNC
181 : {"open_sync", SYNC_METHOD_OPEN, false},
182 : #endif
183 : #ifdef O_DSYNC
184 : {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
185 : #endif
186 : {NULL, 0, false}
187 : };
188 :
189 :
190 : /*
191 : * Although only "on", "off", and "always" are documented, we accept all the
192 : * likely variants of "on" and "off" (the entries whose third field is true).
193 : */
194 : const struct config_enum_entry archive_mode_options[] = {
195 : {"always", ARCHIVE_MODE_ALWAYS, false},
196 : {"on", ARCHIVE_MODE_ON, false},
197 : {"off", ARCHIVE_MODE_OFF, false},
198 : {"true", ARCHIVE_MODE_ON, true},
199 : {"false", ARCHIVE_MODE_OFF, true},
200 : {"yes", ARCHIVE_MODE_ON, true},
201 : {"no", ARCHIVE_MODE_OFF, true},
202 : {"1", ARCHIVE_MODE_ON, true},
203 : {"0", ARCHIVE_MODE_OFF, true},
204 : {NULL, 0, false}
205 : };
206 :
207 : /*
208 : * Statistics for current checkpoint are collected in this global struct.
209 : * Because only the checkpointer or a stand-alone backend can perform
210 : * checkpoints, this will be unused in normal backends.
211 : */
212 : CheckpointStatsData CheckpointStats;
213 :
214 : /*
215 : * During recovery, lastFullPageWrites keeps track of full_page_writes that
216 : * the replayed WAL records indicate. It's initialized with full_page_writes
217 : * that the recovery starting checkpoint record indicates, and then updated
218 : * each time XLOG_FPW_CHANGE record is replayed.
219 : */
220 : static bool lastFullPageWrites;
221 :
222 : /*
223 : * Local copy of the state tracked by SharedRecoveryState in shared memory,
224 : * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
225 : * means "not known, need to check the shared state".
226 : */
227 : static bool LocalRecoveryInProgress = true;
228 :
229 : /*
230 : * Local state for XLogInsertAllowed():
231 : * 1: unconditionally allowed to insert XLOG
232 : * 0: unconditionally not allowed to insert XLOG
233 : * -1: must check RecoveryInProgress(); disallow until it is false
234 : * Most processes start with -1 and transition to 1 after seeing that recovery
235 : * is not in progress. But we can also force the value for special cases.
236 : * The coding in XLogInsertAllowed() depends on the first two of these states
237 : * being numerically the same as bool true and false.
238 : */
239 : static int LocalXLogInsertAllowed = -1;
240 :
241 : /*
242 : * ProcLastRecPtr points to the start of the last XLOG record inserted by the
243 : * current backend. It is updated for all inserts. XactLastRecEnd points to
244 : * end+1 of the last record, and is reset when we end a top-level transaction,
245 : * or start a new one; so it can be used to tell if the current transaction has
246 : * created any XLOG records.
247 : *
248 : * While in parallel mode, this may not be fully up to date. When committing,
249 : * a transaction can assume this covers all xlog records written either by the
250 : * user backend or by any parallel worker which was present at any point during
251 : * the transaction. But when aborting, or when still in parallel mode, other
252 : * parallel backends may have written WAL records at later LSNs than the value
253 : * stored here. The parallel leader advances its own copy, when necessary,
254 : * in WaitForParallelWorkersToFinish.
255 : */
256 : XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
257 : XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
258 : XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
259 :
260 : /*
261 : * RedoRecPtr is this backend's local copy of the REDO record pointer
262 : * (which is almost but not quite the same as a pointer to the most recent
263 : * CHECKPOINT record). We update this from the shared-memory copy,
264 : * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
265 : * hold an insertion lock). See XLogInsertRecord for details. We are also
266 : * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
267 : * see GetRedoRecPtr.
268 : *
269 : * NB: Code that uses this variable must be prepared not only for the
270 : * possibility that it may be arbitrarily out of date, but also for the
271 : * possibility that it might be set to InvalidXLogRecPtr. We used to
272 : * initialize it as a side effect of the first call to RecoveryInProgress(),
273 : * which meant that most code that might use it could assume that it had a
274 : * real if perhaps stale value. That's no longer the case.
275 : */
276 : static XLogRecPtr RedoRecPtr;
277 :
278 : /*
279 : * doPageWrites is this backend's local copy of (fullPageWrites ||
280 : * runningBackups > 0). It is used together with RedoRecPtr to decide whether
281 : * a full-page image of a page need to be taken.
282 : *
283 : * NB: Initially this is false, and there's no guarantee that it will be
284 : * initialized to any other value before it is first used. Any code that
285 : * makes use of it must recheck the value after obtaining a WALInsertLock,
286 : * and respond appropriately if it turns out that the previous value wasn't
287 : * accurate.
288 : */
289 : static bool doPageWrites;
290 :
291 : /*----------
292 : * Shared-memory data structures for XLOG control
293 : *
294 : * LogwrtRqst indicates a byte position that we need to write and/or fsync
295 : * the log up to (all records before that point must be written or fsynced).
296 : * LogwrtResult indicates the byte positions we have already written/fsynced.
297 : * These structs are identical but are declared separately to indicate their
298 : * slightly different functions.
299 : *
300 : * To read XLogCtl->LogwrtResult, you must hold either info_lck or
301 : * WALWriteLock. To update it, you need to hold both locks. The point of
302 : * this arrangement is that the value can be examined by code that already
303 : * holds WALWriteLock without needing to grab info_lck as well. In addition
304 : * to the shared variable, each backend has a private copy of LogwrtResult,
305 : * which is updated when convenient.
306 : *
307 : * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
308 : * (protected by info_lck), but we don't need to cache any copies of it.
309 : *
310 : * info_lck is only held long enough to read/update the protected variables,
311 : * so it's a plain spinlock. The other locks are held longer (potentially
312 : * over I/O operations), so we use LWLocks for them. These locks are:
313 : *
314 : * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
315 : * It is only held while initializing and changing the mapping. If the
316 : * contents of the buffer being replaced haven't been written yet, the mapping
317 : * lock is released while the write is done, and reacquired afterwards.
318 : *
319 : * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
320 : * XLogFlush).
321 : *
322 : * ControlFileLock: must be held to read/update control file or create
323 : * new log file.
324 : *
325 : *----------
326 : */
327 :
328 : typedef struct XLogwrtRqst
329 : {
330 : XLogRecPtr Write; /* requested write position: last byte + 1 to write out */
331 : XLogRecPtr Flush; /* requested flush position: last byte + 1 to flush */
332 : } XLogwrtRqst;
333 :
334 : typedef struct XLogwrtResult
335 : {
336 : XLogRecPtr Write; /* write progress: last byte + 1 already written out */
337 : XLogRecPtr Flush; /* flush progress: last byte + 1 already flushed */
338 : } XLogwrtResult;
339 :
340 : /*
341 : * Inserting to WAL is protected by a small fixed number of WAL insertion
342 : * locks. To insert to the WAL, you must hold one of the locks - it doesn't
343 : * matter which one. To lock out other concurrent insertions, you must hold
344 : * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
345 : * indicator of how far the insertion has progressed (insertingAt).
346 : *
347 : * The insertingAt values are read when a process wants to flush WAL from
348 : * the in-memory buffers to disk, to check that all the insertions to the
349 : * region the process is about to write out have finished. You could simply
350 : * wait for all currently in-progress insertions to finish, but the
351 : * insertingAt indicator allows you to ignore insertions to later in the WAL,
352 : * so that you only wait for the insertions that are modifying the buffers
353 : * you're about to write out.
354 : *
355 : * This isn't just an optimization. If all the WAL buffers are dirty, an
356 : * inserter that's holding a WAL insert lock might need to evict an old WAL
357 : * buffer, which requires flushing the WAL. If it's possible for an inserter
358 : * to block on another inserter unnecessarily, deadlock can arise when two
359 : * inserters holding a WAL insert lock wait for each other to finish their
360 : * insertion.
361 : *
362 : * Small WAL records that don't cross a page boundary never update the value,
363 : * the WAL record is just copied to the page and the lock is released. But
364 : * to avoid the deadlock-scenario explained above, the indicator is always
365 : * updated before sleeping while holding an insertion lock.
366 : *
367 : * lastImportantAt contains the LSN of the last important WAL record inserted
368 : * using a given lock. This value is used to detect if there has been
369 : * important WAL activity since the last time some action, like a checkpoint,
370 : * was performed - allowing to not repeat the action if not. The LSN is
371 : * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
372 : * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
373 : * records. Tracking the WAL activity directly in WALInsertLock has the
374 : * advantage of not needing any additional locks to update the value.
375 : */
376 : typedef struct
377 : {
378 : LWLock lock; /* the lightweight lock itself */
379 : XLogRecPtr insertingAt; /* how far the insertion has progressed */
380 : XLogRecPtr lastImportantAt; /* LSN of last important record inserted using this lock */
381 : } WALInsertLock;
382 :
383 : /*
384 : * All the WAL insertion locks are allocated as an array in shared memory. We
385 : * force the array stride to be a power of 2, which saves a few cycles in
386 : * indexing, but more importantly also ensures that individual slots don't
387 : * cross cache line boundaries. (Of course, we have to also ensure that the
388 : * array start address is suitably aligned.)
389 : */
390 : typedef union WALInsertLockPadded
391 : {
392 : WALInsertLock l; /* the actual lock data */
393 : char pad[PG_CACHE_LINE_SIZE]; /* makes the union at least PG_CACHE_LINE_SIZE bytes */
394 : } WALInsertLockPadded;
395 :
396 : /*
397 : * Session status of running backup, used for sanity checks in SQL-callable
398 : * functions to start and stop backups.
399 : */
400 : static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
401 :
402 : /*
403 : * Shared state data for WAL insertion.
404 : */
405 : typedef struct XLogCtlInsert
406 : {
407 : slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
408 :
409 : /*
410 : * CurrBytePos is the end of reserved WAL. The next record will be
411 : * inserted at that position. PrevBytePos is the start position of the
412 : * previously inserted (or rather, reserved) record - it is copied to the
413 : * prev-link of the next record. These are stored as "usable byte
414 : * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
415 : */
416 : uint64 CurrBytePos; /* end of reserved WAL, as a usable byte position */
417 : uint64 PrevBytePos; /* start of the previously reserved record */
418 :
419 : /*
420 : * Make sure the above heavily-contended spinlock and byte positions are
421 : * on their own cache line. In particular, the RedoRecPtr and full page
422 : * write variables below should be on a different cache line. They are
423 : * read on every WAL insertion, but updated rarely, and we don't want
424 : * those reads to steal the cache line containing Curr/PrevBytePos.
425 : */
426 : char pad[PG_CACHE_LINE_SIZE];
427 :
428 : /*
429 : * fullPageWrites is the authoritative value used by all backends to
430 : * determine whether to write full-page images to WAL. This shared value,
431 : * instead of the process-local fullPageWrites, is required because, when
432 : * full_page_writes is changed by SIGHUP, we must WAL-log it before it
433 : * actually affects WAL-logging by backends. The checkpointer sets it at
434 : * startup or after SIGHUP.
435 : *
436 : * To read these fields, you must hold an insertion lock. To modify them,
437 : * you must hold ALL the locks.
438 : */
439 : XLogRecPtr RedoRecPtr; /* current redo point for insertions */
440 : bool fullPageWrites; /* shared, authoritative full_page_writes value */
441 :
442 : /*
443 : * runningBackups is a counter indicating the number of backups currently
444 : * in progress. lastBackupStart is the latest checkpoint redo location
445 : * used as a starting point for an online backup.
446 : */
447 : int runningBackups; /* number of backups currently in progress */
448 : XLogRecPtr lastBackupStart; /* latest redo location used to start a backup */
449 :
450 : /*
451 : * WAL insertion locks.
452 : */
453 : WALInsertLockPadded *WALInsertLocks;
454 : } XLogCtlInsert;
455 :
456 : /*
457 : * Total shared-memory state for XLOG.
458 : */
459 : typedef struct XLogCtlData
460 : {
461 : XLogCtlInsert Insert; /* shared state data for WAL insertion */
462 :
463 : /* Protected by info_lck: */
464 : XLogwrtRqst LogwrtRqst; /* position the log needs to be written/flushed up to */
465 : XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
466 : FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */
467 : XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
468 : XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
469 :
470 : XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
471 :
472 : /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
473 : XLogRecPtr unloggedLSN;
474 : slock_t ulsn_lck;
475 :
476 : /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
477 : pg_time_t lastSegSwitchTime;
478 : XLogRecPtr lastSegSwitchLSN;
479 :
480 : /*
481 : * Protected by info_lck and WALWriteLock (you must hold either lock to
482 : * read it, but both to update)
483 : */
484 : XLogwrtResult LogwrtResult;
485 :
486 : /*
487 : * Latest initialized page in the cache (last byte position + 1).
488 : *
489 : * To change the identity of a buffer (and InitializedUpTo), you need to
490 : * hold WALBufMappingLock. To change the identity of a buffer that's
491 : * still dirty, the old page needs to be written out first, and for that
492 : * you need WALWriteLock, and you need to ensure that there are no
493 : * in-progress insertions to the page by calling
494 : * WaitXLogInsertionsToFinish().
495 : */
496 : XLogRecPtr InitializedUpTo;
497 :
498 : /*
499 : * These values do not change after startup, although the pointed-to pages
500 : * and xlblocks values certainly do. xlblocks values are protected by
501 : * WALBufMappingLock.
502 : */
503 : char *pages; /* buffers for unwritten XLOG pages */
504 : XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
505 : int XLogCacheBlck; /* highest allocated xlog buffer index */
506 :
507 : /*
508 : * InsertTimeLineID is the timeline into which new WAL is being inserted
509 : * and flushed. It is zero during recovery, and does not change once set.
510 : *
511 : * If we create a new timeline when the system was started up,
512 : * PrevTimeLineID is the old timeline's ID that we forked off from.
513 : * Otherwise it's equal to InsertTimeLineID.
514 : */
515 : TimeLineID InsertTimeLineID;
516 : TimeLineID PrevTimeLineID;
517 :
518 : /*
519 : * SharedRecoveryState indicates if we're still in crash or archive
520 : * recovery. Protected by info_lck.
521 : */
522 : RecoveryState SharedRecoveryState;
523 :
524 : /*
525 : * InstallXLogFileSegmentActive indicates whether the checkpointer should
526 : * arrange for future segments by recycling and/or PreallocXlogFiles().
527 : * Protected by ControlFileLock. Only the startup process changes it. If
528 : * true, anyone can use InstallXLogFileSegment(). If false, the startup
529 : * process owns the exclusive right to install segments, by reading from
530 : * the archive and possibly replacing existing files.
531 : */
532 : bool InstallXLogFileSegmentActive;
533 :
534 : /*
535 : * WalWriterSleeping indicates whether the WAL writer is currently in
536 : * low-power mode (and hence should be nudged if an async commit occurs).
537 : * Protected by info_lck.
538 : */
539 : bool WalWriterSleeping;
540 :
541 : /*
542 : * During recovery, we keep a copy of the latest checkpoint record here.
543 : * lastCheckPointRecPtr points to start of checkpoint record and
544 : * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
545 : * checkpointer when it wants to create a restartpoint.
546 : *
547 : * Protected by info_lck.
548 : */
549 : XLogRecPtr lastCheckPointRecPtr;
550 : XLogRecPtr lastCheckPointEndPtr;
551 : CheckPoint lastCheckPoint;
552 :
553 : /*
554 : * lastFpwDisableRecPtr points to the start of the last replayed
555 : * XLOG_FPW_CHANGE record that indicates full_page_writes is disabled.
556 : */
557 : XLogRecPtr lastFpwDisableRecPtr;
558 :
559 : slock_t info_lck; /* locks shared variables shown above */
560 : } XLogCtlData;
561 :
562 : static XLogCtlData *XLogCtl = NULL;
563 :
564 : /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
565 : static WALInsertLockPadded *WALInsertLocks = NULL;
566 :
567 : /*
568 : * We maintain an image of pg_control in shared memory.
569 : */
570 : static ControlFileData *ControlFile = NULL;
571 :
572 : /*
573 : * Calculate the amount of space left on the page after 'endptr' (zero when
574 : * 'endptr' is exactly on a page boundary). Beware multiple evaluation!
575 : */
576 : #define INSERT_FREESPACE(endptr) \
577 : (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
578 :
579 : /* Advance to the next buffer index, wrapping to 0 after XLogCacheBlck. */
580 : #define NextBufIdx(idx) \
581 : (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
582 :
583 : /*
584 : * XLogRecPtrToBufIdx maps 'recptr' to the index of the WAL buffer that holds
585 : * (or would hold, if cached) its page: page number mod (XLogCacheBlck + 1).
586 : */
587 : #define XLogRecPtrToBufIdx(recptr) \
588 : (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
589 :
590 : /*
591 : * The number of bytes in a WAL page usable for WAL data.
592 : */
593 : #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
594 :
595 : /*
596 : * Convert values of GUCs measured in megabytes to equiv. segment count.
597 : * Rounds down.
598 : */
599 : #define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize))
600 :
601 : /* The number of bytes in a WAL segment usable for WAL data. */
602 : static int UsableBytesInSegment;
603 :
604 : /*
605 : * Private, possibly out-of-date copy of shared LogwrtResult.
606 : * See discussion above.
607 : */
608 : static XLogwrtResult LogwrtResult = {0, 0};
609 :
610 : /*
611 : * openLogFile is -1 or a kernel FD for an open log file segment.
612 : * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
613 : * These variables are only used to write the XLOG, and so will normally refer
614 : * to the active segment.
615 : *
616 : * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
617 : */
618 : static int openLogFile = -1;
619 : static XLogSegNo openLogSegNo = 0;
620 : static TimeLineID openLogTLI = 0;
621 :
622 : /*
623 : * Local copies of equivalent fields in the control file. When running
624 : * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
625 : * expect to replay all the WAL available, and updateMinRecoveryPoint is
626 : * switched to false to prevent any updates while replaying records.
627 : * Those values are kept consistent as long as crash recovery runs.
628 : */
629 : static XLogRecPtr LocalMinRecoveryPoint;
630 : static TimeLineID LocalMinRecoveryPointTLI;
631 : static bool updateMinRecoveryPoint = true;
632 :
633 : /* For WALInsertLockAcquire/Release functions */
634 : static int MyLockNo = 0;
635 : static bool holdingAllLocks = false;
636 :
637 : #ifdef WAL_DEBUG
638 : static MemoryContext walDebugCxt = NULL;
639 : #endif
640 :
641 : static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
642 : XLogRecPtr EndOfLog,
643 : TimeLineID newTLI);
644 : static void CheckRequiredParameterValues(void);
645 : static void XLogReportParameters(void);
646 : static int LocalSetXLogInsertAllowed(void);
647 : static void CreateEndOfRecoveryRecord(void);
648 : static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
649 : XLogRecPtr pagePtr,
650 : TimeLineID newTLI);
651 : static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
652 : static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
653 : static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
654 :
655 : static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
656 : bool opportunistic);
657 : static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
658 : static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
659 : bool find_free, XLogSegNo max_segno,
660 : TimeLineID tli);
661 : static void XLogFileClose(void);
662 : static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
663 : static void RemoveTempXlogFiles(void);
664 : static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
665 : XLogRecPtr endptr, TimeLineID insertTLI);
666 : static void RemoveXlogFile(const struct dirent *segment_de,
667 : XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
668 : TimeLineID insertTLI);
669 : static void UpdateLastRemovedPtr(char *filename);
670 : static void ValidateXLOGDirectoryStructure(void);
671 : static void CleanupBackupHistory(void);
672 : static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
673 : static bool PerformRecoveryXLogAction(void);
674 : static void InitControlFile(uint64 sysidentifier);
675 : static void WriteControlFile(void);
676 : static void ReadControlFile(void);
677 : static void UpdateControlFile(void);
678 : static char *str_time(pg_time_t tnow);
679 :
680 : static int get_sync_bit(int method);
681 :
682 : static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
683 : XLogRecData *rdata,
684 : XLogRecPtr StartPos, XLogRecPtr EndPos,
685 : TimeLineID tli);
686 : static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
687 : XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
688 : static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
689 : XLogRecPtr *PrevPtr);
690 : static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
691 : static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
692 : static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
693 : static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
694 : static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
695 :
696 : static void WALInsertLockAcquire(void);
697 : static void WALInsertLockAcquireExclusive(void);
698 : static void WALInsertLockRelease(void);
699 : static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
700 :
701 : /*
702 : * Insert an XLOG record represented by an already-constructed chain of data
703 : * chunks. This is a low-level routine; to construct the WAL record header
704 : * and data, use the higher-level routines in xloginsert.c.
705 : *
706 : * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
707 : * WAL record applies to, that were not included in the record as full page
708 : * images. If fpw_lsn <= RedoRecPtr, the function does not perform the
709 : * insertion and returns InvalidXLogRecPtr. The caller can then recalculate
710 : * which pages need a full-page image, and retry. If fpw_lsn is invalid, the
711 : * record is always inserted.
712 : *
713 : * 'flags' gives more in-depth control on the record being inserted. See
714 : * XLogSetRecordFlags() for details.
715 : *
716 : * 'topxid_included' tells whether the top-transaction id is logged along with
717 : * current subtransaction. See XLogRecordAssemble().
718 : *
719 : * The first XLogRecData in the chain must be for the record header, and its
720 : * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
721 : * xl_crc fields in the header, the rest of the header must already be filled
722 : * by the caller.
723 : *
724 : * Returns XLOG pointer to end of record (beginning of next record).
725 : * This can be used as LSN for data pages affected by the logged action.
726 : * (LSN is the XLOG point up to which the XLOG must be flushed to disk
727 : * before the data page can be written out. This implements the basic
728 : * WAL rule "write the log before the data".)
729 : */
730 : XLogRecPtr
2299 andres 731 GIC 19404394 : XLogInsertRecord(XLogRecData *rdata,
732 : XLogRecPtr fpw_lsn,
733 : uint8 flags,
523 akapila 734 ECB : int num_fpi,
735 : bool topxid_included)
736 : {
	/*
	 * Parameter summary (as used in this function):
	 *
	 * rdata           - chain of record chunks; the first chunk's data is
	 *                   interpreted as the XLogRecord header.
	 * fpw_lsn         - if full-page writes are in effect and this is valid
	 *                   and <= RedoRecPtr, the record is rejected and
	 *                   InvalidXLogRecPtr is returned so the caller starts
	 *                   over.
	 * flags           - XLOG_MARK_UNIMPORTANT suppresses the update of the
	 *                   insertion lock's lastImportantAt.
	 * num_fpi         - added to pgWalUsage.wal_fpi when a record is written.
	 * topxid_included - if true, MarkSubxactTopXidLogged() is called after
	 *                   the insertion.
	 */
8053 bruce 737 GIC 19404394 : XLogCtlInsert *Insert = &XLogCtl->Insert;
738 : pg_crc32c rdata_crc;
739 : bool inserted;
3076 heikki.linnakangas 740 CBC 19404394 : XLogRecord *rechdr = (XLogRecord *) rdata->data;
2347 tgl 741 GIC 19404394 : uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
3076 heikki.linnakangas 742 19404394 : bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
2347 tgl 743 ECB : info == XLOG_SWITCH);
3562 heikki.linnakangas 744 : XLogRecPtr StartPos;
745 : XLogRecPtr EndPos;
	/* doPageWrites as it stood before being refreshed under the lock below */
1669 akapila 746 GIC 19404394 : bool prevDoPageWrites = doPageWrites;
747 : TimeLineID insertTLI;
748 :
3062 heikki.linnakangas 749 ECB : /* we assume that all of the record header is in the first chunk */
3062 heikki.linnakangas 750 GIC 19404394 : Assert(rdata->len >= SizeOfXLogRecord);
751 :
752 : /* cross-check on whether we should be here or not */
5035 tgl 753 CBC 19404394 : if (!XLogInsertAllowed())
5035 tgl 754 UIC 0 : elog(ERROR, "cannot make new WAL entries during recovery");
755 :
520 rhaas 756 ECB : /*
515 rhaas 757 EUB : * Given that we're not in recovery, InsertTimeLineID is set and can't
758 : * change, so we can read it without a lock.
759 : */
515 rhaas 760 GIC 19404394 : insertTLI = XLogCtl->InsertTimeLineID;
761 :
762 : /*----------
3562 heikki.linnakangas 763 ECB : *
764 : * We have now done all the preparatory work we can without holding a
765 : * lock or modifying shared state. From here on, inserting the new WAL
766 : * record to the shared WAL buffer cache is a two-step process:
767 : *
768 : * 1. Reserve the right amount of space from the WAL. The current head of
769 : * reserved space is kept in Insert->CurrBytePos, and is protected by
770 : * insertpos_lck.
771 : *
772 : * 2. Copy the record to the reserved WAL space. This involves finding the
773 : * correct WAL buffer containing the reserved space, and copying the
774 : * record in place. This can be done concurrently in multiple processes.
775 : *
776 : * To keep track of which insertions are still in-progress, each concurrent
777 : * inserter acquires an insertion lock. In addition to just indicating that
778 : * an insertion is in progress, the lock tells others how far the inserter
779 : * has progressed. There is a small fixed number of insertion locks,
780 : * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
781 : * boundary, it updates the value stored in the lock to how far it has
782 : * inserted, to allow the previous buffer to be flushed.
783 : *
784 : * Holding onto an insertion lock also protects RedoRecPtr and
785 : * fullPageWrites from changing until the insertion is finished.
786 : *
787 : * Step 2 can usually be done completely in parallel. If the required WAL
788 : * page is not initialized yet, you have to grab WALBufMappingLock to
789 : * initialize it, but the WAL writer tries to do that ahead of insertions
790 : * to avoid that from happening in the critical path.
791 : *
792 : *----------
793 : */
4106 heikki.linnakangas 794 GIC 19404394 : START_CRIT_SECTION();
3306 795 19404394 : if (isLogSwitch)
796 442 : WALInsertLockAcquireExclusive();
3306 heikki.linnakangas 797 ECB : else
3306 heikki.linnakangas 798 CBC 19403952 : WALInsertLockAcquire();
4106 heikki.linnakangas 799 ECB :
800 : /*
1669 akapila 801 : * Check to see if my copy of RedoRecPtr is out of date. If so, may have
802 : * to go back and have the caller recompute everything. This can only
803 : * happen just after a checkpoint, so it's better to be slow in this case
804 : * and fast otherwise.
805 : *
806 : * Also check to see if fullPageWrites was just turned on or there's a
807 : * running backup (which forces full-page writes); if we weren't already
808 : * doing full-page writes then go back and recompute.
809 : *
810 : * If we aren't doing full-page writes then RedoRecPtr doesn't actually
811 : * affect the contents of the XLOG record, so we'll update our local copy
812 : * but not force a recomputation. (If doPageWrites was just turned off,
813 : * we could recompute the record without full pages, but we choose not to
814 : * bother.)
815 : */
3754 alvherre 816 GIC 19404394 : if (RedoRecPtr != Insert->RedoRecPtr)
817 : {
818 4522 : Assert(RedoRecPtr < Insert->RedoRecPtr);
4106 heikki.linnakangas 819 CBC 4522 : RedoRecPtr = Insert->RedoRecPtr;
820 : }
172 alvherre 821 GNC 19404394 : doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
4106 heikki.linnakangas 822 ECB :
1669 akapila 823 GIC 19404394 : if (doPageWrites &&
1669 akapila 824 CBC 19214991 : (!prevDoPageWrites ||
1669 akapila 825 GIC 18134561 : (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
4106 heikki.linnakangas 826 ECB : {
3076 827 : /*
2878 bruce 828 : * Oops, some buffer now needs to be backed up that the caller didn't
829 : * back up. Start over.
830 : */
3306 heikki.linnakangas 831 GIC 4992 : WALInsertLockRelease();
4106 832 4992 : END_CRIT_SECTION();
3076 833 4992 : return InvalidXLogRecPtr;
4106 heikki.linnakangas 834 ECB : }
835 :
6997 tgl 836 : /*
837 : * Reserve space for the record in the WAL. This also sets the xl_prev
838 : * pointer.
839 : */
3562 heikki.linnakangas 840 GIC 19399402 : if (isLogSwitch)
841 300 : inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
842 : else
3562 heikki.linnakangas 843 ECB : {
3076 heikki.linnakangas 844 CBC 19399102 : ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
845 : &rechdr->xl_prev);
3562 heikki.linnakangas 846 GIC 19399102 : inserted = true;
3562 heikki.linnakangas 847 ECB : }
848 :
3562 heikki.linnakangas 849 CBC 19399402 : if (inserted)
850 : {
851 : /*
3062 heikki.linnakangas 852 ECB : * Now that xl_prev has been filled in, calculate CRC of the record
853 : * header.
854 : */
3062 heikki.linnakangas 855 GIC 19399350 : rdata_crc = rechdr->xl_crc;
856 19399350 : COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
3078 857 19399350 : FIN_CRC32C(rdata_crc);
3562 heikki.linnakangas 858 CBC 19399350 : rechdr->xl_crc = rdata_crc;
3562 heikki.linnakangas 859 ECB :
860 : /*
861 : * All the record data, including the header, is now ready to be
862 : * inserted. Copy the record in the space reserved.
863 : */
3076 heikki.linnakangas 864 GIC 19399350 : CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
865 : StartPos, EndPos, insertTLI);
866 :
2299 andres 867 ECB : /*
868 : * Unless record is flagged as not important, update LSN of last
869 : * important record in the current slot. When holding all locks, just
870 : * update the first one.
871 : */
2299 andres 872 GIC 19399350 : if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
873 : {
2153 bruce 874 19259839 : int lockno = holdingAllLocks ? 0 : MyLockNo;
2299 andres 875 ECB :
2299 andres 876 GIC 19259839 : WALInsertLocks[lockno].l.lastImportantAt = StartPos;
2299 andres 877 ECB : }
878 : }
3562 heikki.linnakangas 879 : else
880 : {
881 : /*
882 : * This was an xlog-switch record, but the current insert location was
883 : * already exactly at the beginning of a segment, so there was no need
884 : * to do anything.
885 : */
886 : }
887 :
888 : /*
889 : * Done! Let others know that we're finished.
890 : */
3306 heikki.linnakangas 891 GIC 19399402 : WALInsertLockRelease();
892 :
3562 893 19399402 : END_CRIT_SECTION();
3562 heikki.linnakangas 894 ECB :
523 akapila 895 GIC 19399402 : MarkCurrentTransactionIdLoggedIfAny();
523 akapila 896 ECB :
897 : /*
898 : * Mark the top transaction id as logged (if needed) so that we do not try
899 : * to log it again with the next WAL record in the current subtransaction.
900 : */
523 akapila 901 GIC 19399402 : if (topxid_included)
902 234 : MarkSubxactTopXidLogged();
903 :
3562 heikki.linnakangas 904 ECB : /*
582 alvherre 905 : * Update shared LogwrtRqst.Write, if we crossed page boundary.
906 : */
3562 heikki.linnakangas 907 GIC 19399402 : if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
908 : {
3121 andres 909 444287 : SpinLockAcquire(&XLogCtl->info_lck);
582 alvherre 910 ECB : /* advance global request to include new block(s) */
3121 andres 911 GIC 444287 : if (XLogCtl->LogwrtRqst.Write < EndPos)
3121 andres 912 CBC 444138 : XLogCtl->LogwrtRqst.Write = EndPos;
913 : /* update local result copy while I have the chance */
914 444287 : LogwrtResult = XLogCtl->LogwrtResult;
915 444287 : SpinLockRelease(&XLogCtl->info_lck);
916 : }
3562 heikki.linnakangas 917 ECB :
918 : /*
919 : * If this was an XLOG_SWITCH record, flush the record and the empty
920 : * padding space that fills the rest of the segment, and perform
921 : * end-of-segment actions (eg, notifying archiver).
922 : */
3562 heikki.linnakangas 923 GIC 19399402 : if (isLogSwitch)
924 : {
925 : TRACE_POSTGRESQL_WAL_SWITCH();
3562 heikki.linnakangas 926 CBC 300 : XLogFlush(EndPos);
927 :
928 : /*
3562 heikki.linnakangas 929 ECB : * Even though we reserved the rest of the segment for us, which is
930 : * reflected in EndPos, we return a pointer to just the end of the
931 : * xlog-switch record.
932 : */
3562 heikki.linnakangas 933 GIC 300 : if (inserted)
934 : {
			/*
			 * Recompute EndPos as start + header size.  If that lands on a
			 * different page than StartPos, also step over the page header:
			 * the long variant when the offset within the segment equals
			 * the offset within the page, otherwise the short variant.
			 */
935 248 : EndPos = StartPos + SizeOfXLogRecord;
3562 heikki.linnakangas 936 CBC 248 : if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
937 : {
2028 andres 938 LBC 0 : uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
2028 andres 939 ECB :
2028 andres 940 UIC 0 : if (offset == EndPos % XLOG_BLCKSZ)
3562 heikki.linnakangas 941 UBC 0 : EndPos += SizeOfXLogLongPHD;
942 : else
943 0 : EndPos += SizeOfXLogShortPHD;
3562 heikki.linnakangas 944 EUB : }
945 : }
946 : }
947 :
948 : #ifdef WAL_DEBUG
949 : if (XLOG_DEBUG)
950 : {
951 : static XLogReaderState *debug_reader = NULL;
952 : XLogRecord *record;
953 : DecodedXLogRecord *decoded;
954 : StringInfoData buf;
955 : StringInfoData recordBuf;
956 : char *errormsg = NULL;
957 : MemoryContext oldCxt;
958 :
959 : oldCxt = MemoryContextSwitchTo(walDebugCxt);
960 :
961 : initStringInfo(&buf);
962 : appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
963 :
964 : /*
965 : * We have to piece together the WAL record data from the XLogRecData
966 : * entries, so that we can pass it to the rm_desc function as one
967 : * contiguous chunk.
968 : */
969 : initStringInfo(&recordBuf);
		/* NB: this walk advances rdata itself; it is NULL afterwards */
970 : for (; rdata != NULL; rdata = rdata->next)
971 : appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
972 :
973 : /* We also need temporary space to decode the record. */
974 : record = (XLogRecord *) recordBuf.data;
975 : decoded = (DecodedXLogRecord *)
976 : palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
977 :
978 : if (!debug_reader)
979 : debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
980 : XL_ROUTINE(), NULL);
981 :
982 : if (!debug_reader)
983 : {
984 : appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
985 : }
986 : else if (!DecodeXLogRecord(debug_reader,
987 : decoded,
988 : record,
989 : EndPos,
990 : &errormsg))
991 : {
992 : appendStringInfo(&buf, "error decoding record: %s",
993 : errormsg ? errormsg : "no error message");
994 : }
995 : else
996 : {
997 : appendStringInfoString(&buf, " - ");
998 :
999 : debug_reader->record = decoded;
1000 : xlog_outdesc(&buf, debug_reader);
1001 : debug_reader->record = NULL;
1002 : }
1003 : elog(LOG, "%s", buf.data);
1004 :
1005 : pfree(decoded);
1006 : pfree(buf.data);
1007 : pfree(recordBuf.data);
1008 : MemoryContextSwitchTo(oldCxt);
1009 : }
1010 : #endif
1011 :
1012 : /*
1013 : * Update our global variables
1014 : */
3562 heikki.linnakangas 1015 GIC 19399402 : ProcLastRecPtr = StartPos;
1016 19399402 : XactLastRecEnd = EndPos;
1017 :
1100 akapila 1018 ECB : /* Report WAL traffic to the instrumentation. */
	/* (skipped when the xlog-switch turned out to be a no-op) */
1100 akapila 1019 CBC 19399402 : if (inserted)
1020 : {
1100 akapila 1021 GIC 19399350 : pgWalUsage.wal_bytes += rechdr->xl_tot_len;
1100 akapila 1022 CBC 19399350 : pgWalUsage.wal_records++;
1069 akapila 1023 GIC 19399350 : pgWalUsage.wal_fpi += num_fpi;
1100 akapila 1024 ECB : }
1025 :
3562 heikki.linnakangas 1026 CBC 19399402 : return EndPos;
1027 : }
1028 :
3562 heikki.linnakangas 1029 ECB : /*
1030 : * Reserves the right amount of space for a record of given size from the WAL.
1031 : * *StartPos is set to the beginning of the reserved section, *EndPos to
1032 : * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1033 : * used to set the xl_prev of this record.
1034 : *
1035 : * This is the performance critical part of XLogInsert that must be serialized
1036 : * across backends. The rest can happen mostly in parallel. Try to keep this
1037 : * section as short as possible, insertpos_lck can be heavily contended on a
1038 : * busy system.
1039 : *
1040 : * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1041 : * where we actually copy the record to the reserved space.
1042 : */
1043 : static void
3562 heikki.linnakangas 1044 GIC 19399102 : ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1045 : XLogRecPtr *PrevPtr)
1046 : {
3121 andres 1047 CBC 19399102 : XLogCtlInsert *Insert = &XLogCtl->Insert;
1048 : uint64 startbytepos;
1049 : uint64 endbytepos;
3562 heikki.linnakangas 1050 ECB : uint64 prevbytepos;
1051 :
3562 heikki.linnakangas 1052 GIC 19399102 : size = MAXALIGN(size);
1053 :
1054 : /* All (non xlog-switch) records should contain data. */
3562 heikki.linnakangas 1055 CBC 19399102 : Assert(size > SizeOfXLogRecord);
1056 :
1057 : /*
3562 heikki.linnakangas 1058 ECB : * The duration the spinlock needs to be held is minimized by minimizing
1059 : * the calculations that have to be done while holding the lock. The
1060 : * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1061 : * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1062 : * page headers. The mapping between "usable" byte positions and physical
1063 : * positions (XLogRecPtrs) can be done outside the locked region, and
1064 : * because the usable byte position doesn't include any headers, reserving
1065 : * X bytes from WAL is almost as simple as "CurrBytePos += X".
1066 : */
3562 heikki.linnakangas 1067 GIC 19399102 : SpinLockAcquire(&Insert->insertpos_lck);
1068 :
1069 19399102 : startbytepos = Insert->CurrBytePos;
3562 heikki.linnakangas 1070 CBC 19399102 : endbytepos = startbytepos + size;
3562 heikki.linnakangas 1071 GIC 19399102 : prevbytepos = Insert->PrevBytePos;
3562 heikki.linnakangas 1072 CBC 19399102 : Insert->CurrBytePos = endbytepos;
1073 19399102 : Insert->PrevBytePos = startbytepos;
3562 heikki.linnakangas 1074 ECB :
3562 heikki.linnakangas 1075 CBC 19399102 : SpinLockRelease(&Insert->insertpos_lck);
3562 heikki.linnakangas 1076 ECB :
3562 heikki.linnakangas 1077 GIC 19399102 : *StartPos = XLogBytePosToRecPtr(startbytepos);
3562 heikki.linnakangas 1078 CBC 19399102 : *EndPos = XLogBytePosToEndRecPtr(endbytepos);
3562 heikki.linnakangas 1079 GIC 19399102 : *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
3562 heikki.linnakangas 1080 ECB :
1081 : /*
1082 : * Check that the conversions between "usable byte positions" and
1083 : * XLogRecPtrs work consistently in both directions.
1084 : */
3562 heikki.linnakangas 1085 GIC 19399102 : Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1086 19399102 : Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1087 19399102 : Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
3562 heikki.linnakangas 1088 CBC 19399102 : }
3562 heikki.linnakangas 1089 ECB :
1090 : /*
1091 : * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1092 : *
1093 : * A log-switch record is handled slightly differently. The rest of the
1094 : * segment will be reserved for this insertion, as indicated by the returned
1095 : * *EndPos value. However, if we are already at the beginning of the current
1096 : * segment, *StartPos and *EndPos are set to the current location without
1097 : * reserving any space, and the function returns false.
1098 : */
1099 : static bool
3562 heikki.linnakangas 1100 GIC 300 : ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1101 : {
3121 andres 1102 300 : XLogCtlInsert *Insert = &XLogCtl->Insert;
3562 heikki.linnakangas 1103 ECB : uint64 startbytepos;
1104 : uint64 endbytepos;
1105 : uint64 prevbytepos;
3062 heikki.linnakangas 1106 GIC 300 : uint32 size = MAXALIGN(SizeOfXLogRecord);
1107 : XLogRecPtr ptr;
1108 : uint32 segleft;
3562 heikki.linnakangas 1109 ECB :
1110 : /*
1111 : * These calculations are a bit heavy-weight to be done while holding a
1112 : * spinlock, but since we're holding all the WAL insertion locks, there
1113 : * are no other inserters competing for it. GetXLogInsertRecPtr() does
1114 : * compete for it, but that's not called very frequently.
1115 : */
3562 heikki.linnakangas 1116 GIC 300 : SpinLockAcquire(&Insert->insertpos_lck);
1117 :
1118 300 : startbytepos = Insert->CurrBytePos;
3562 heikki.linnakangas 1119 ECB :
3562 heikki.linnakangas 1120 GIC 300 : ptr = XLogBytePosToEndRecPtr(startbytepos);
2028 andres 1121 CBC 300 : if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1122 : {
3562 heikki.linnakangas 1123 52 : SpinLockRelease(&Insert->insertpos_lck);
1124 52 : *EndPos = *StartPos = ptr;
3562 heikki.linnakangas 1125 GIC 52 : return false;
3562 heikki.linnakangas 1126 ECB : }
1127 :
3562 heikki.linnakangas 1128 CBC 248 : endbytepos = startbytepos + size;
3562 heikki.linnakangas 1129 GIC 248 : prevbytepos = Insert->PrevBytePos;
1130 :
3562 heikki.linnakangas 1131 CBC 248 : *StartPos = XLogBytePosToRecPtr(startbytepos);
1132 248 : *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1133 :
2028 andres 1134 248 : segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1135 248 : if (segleft != wal_segment_size)
1136 : {
3562 heikki.linnakangas 1137 ECB : /* consume the rest of the segment */
3562 heikki.linnakangas 1138 CBC 248 : *EndPos += segleft;
3562 heikki.linnakangas 1139 GIC 248 : endbytepos = XLogRecPtrToBytePos(*EndPos);
1140 : }
3562 heikki.linnakangas 1141 CBC 248 : Insert->CurrBytePos = endbytepos;
1142 248 : Insert->PrevBytePos = startbytepos;
1143 :
1144 248 : SpinLockRelease(&Insert->insertpos_lck);
3562 heikki.linnakangas 1145 ECB :
3562 heikki.linnakangas 1146 GIC 248 : *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
3562 heikki.linnakangas 1147 ECB :
2028 andres 1148 GIC 248 : Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
3562 heikki.linnakangas 1149 CBC 248 : Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
3562 heikki.linnakangas 1150 GIC 248 : Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
3562 heikki.linnakangas 1151 CBC 248 : Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
3562 heikki.linnakangas 1152 ECB :
3562 heikki.linnakangas 1153 CBC 248 : return true;
3562 heikki.linnakangas 1154 ECB : }
1155 :
1156 : /*
1157 : * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
1158 : * area in the WAL.
 *
 * write_len is the total record length: it is used to compute xlp_rem_len
 * when the record continues onto a new page, and the number of bytes copied
 * from the rdata chain is asserted to equal it.  Copying starts at StartPos;
 * tli is passed through to GetXLogBuffer() when locating buffer pages.
 *
 * For an xlog-switch record that does not end on a segment boundary, the
 * rest of the segment is consumed as well, page by page.
 *
 * The final position must equal EndPos; otherwise this PANICs.
1159 : */
1160 : static void
3562 heikki.linnakangas 1161 GIC 19399350 : CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1162 : XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
1163 : {
	/*
	 * Note the two similarly named cursors: 'currpos' is the memory pointer
	 * into the WAL buffer page, 'CurrPos' is the corresponding WAL position.
	 * They are advanced in step.
	 */
3562 heikki.linnakangas 1164 ECB : char *currpos;
1165 : int freespace;
1166 : int written;
1167 : XLogRecPtr CurrPos;
1168 : XLogPageHeader pagehdr;
1169 :
1170 : /*
1171 : * Get a pointer to the right place in the right WAL buffer to start
1172 : * inserting to.
1173 : */
3562 heikki.linnakangas 1174 GIC 19399350 : CurrPos = StartPos;
520 rhaas 1175 19399350 : currpos = GetXLogBuffer(CurrPos, tli);
3562 heikki.linnakangas 1176 19399350 : freespace = INSERT_FREESPACE(CurrPos);
3562 heikki.linnakangas 1177 ECB :
1178 : /*
1179 : * there should be enough space for at least the first field (xl_tot_len)
1180 : * on this page.
1181 : */
3562 heikki.linnakangas 1182 GIC 19399350 : Assert(freespace >= sizeof(uint32));
1183 :
1184 : /* Copy record data */
3562 heikki.linnakangas 1185 CBC 19399350 : written = 0;
3562 heikki.linnakangas 1186 GIC 85337519 : while (rdata != NULL)
1187 : {
3562 heikki.linnakangas 1188 CBC 65938169 : char *rdata_data = rdata->data;
1189 65938169 : int rdata_len = rdata->len;
1190 :
		/* Chunk does not fit on the current page: split it across pages. */
1191 66429163 : while (rdata_len > freespace)
3562 heikki.linnakangas 1192 ECB : {
1193 : /*
1194 : * Write what fits on this page, and continue on the next page.
1195 : */
3562 heikki.linnakangas 1196 GIC 490994 : Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1197 490994 : memcpy(currpos, rdata_data, freespace);
1198 490994 : rdata_data += freespace;
3562 heikki.linnakangas 1199 CBC 490994 : rdata_len -= freespace;
1200 490994 : written += freespace;
1201 490994 : CurrPos += freespace;
3562 heikki.linnakangas 1202 ECB :
1203 : /*
1204 : * Get pointer to beginning of next page, and set the xlp_rem_len
1205 : * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1206 : *
1207 : * It's safe to set the contrecord flag and xlp_rem_len without a
1208 : * lock on the page. All the other flags were already set when the
1209 : * page was initialized, in AdvanceXLInsertBuffer, and we're the
1210 : * only backend that needs to set the contrecord flag.
1211 : */
520 rhaas 1212 GIC 490994 : currpos = GetXLogBuffer(CurrPos, tli);
3562 heikki.linnakangas 1213 490994 : pagehdr = (XLogPageHeader) currpos;
1214 490994 : pagehdr->xlp_rem_len = write_len - written;
3562 heikki.linnakangas 1215 CBC 490994 : pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
3562 heikki.linnakangas 1216 ECB :
1217 : /* skip over the page header */
2028 andres 1218 CBC 490994 : if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1219 : {
3562 heikki.linnakangas 1220 GIC 426 : CurrPos += SizeOfXLogLongPHD;
3562 heikki.linnakangas 1221 CBC 426 : currpos += SizeOfXLogLongPHD;
1222 : }
3562 heikki.linnakangas 1223 ECB : else
1224 : {
3562 heikki.linnakangas 1225 GIC 490568 : CurrPos += SizeOfXLogShortPHD;
1226 490568 : currpos += SizeOfXLogShortPHD;
1227 : }
3562 heikki.linnakangas 1228 CBC 490994 : freespace = INSERT_FREESPACE(CurrPos);
3562 heikki.linnakangas 1229 ECB : }
1230 :
		/* The remainder (possibly the whole chunk) fits on this page. */
3562 heikki.linnakangas 1231 CBC 65938169 : Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
3562 heikki.linnakangas 1232 GIC 65938169 : memcpy(currpos, rdata_data, rdata_len);
1233 65938169 : currpos += rdata_len;
3562 heikki.linnakangas 1234 CBC 65938169 : CurrPos += rdata_len;
1235 65938169 : freespace -= rdata_len;
1236 65938169 : written += rdata_len;
3562 heikki.linnakangas 1237 ECB :
3562 heikki.linnakangas 1238 CBC 65938169 : rdata = rdata->next;
3562 heikki.linnakangas 1239 ECB : }
3562 heikki.linnakangas 1240 GIC 19399350 : Assert(written == write_len);
3562 heikki.linnakangas 1241 ECB :
1242 : /*
1243 : * If this was an xlog-switch, it's not enough to write the switch record,
1244 : * we also have to consume all the remaining space in the WAL segment. We
1245 : * have already reserved that space, but we need to actually fill it.
1246 : */
2028 andres 1247 GIC 19399350 : if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1248 : {
1249 : /* An xlog-switch record doesn't contain any data besides the header */
3562 heikki.linnakangas 1250 CBC 248 : Assert(write_len == SizeOfXLogRecord);
1251 :
1252 : /* Assert that we did reserve the right amount of space */
2028 andres 1253 248 : Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1254 :
1255 : /* Use up all the remaining space on the current page */
3562 heikki.linnakangas 1256 248 : CurrPos += freespace;
1257 :
1258 : /*
1836 tgl 1259 ECB : * Cause all remaining pages in the segment to be flushed, leaving the
1260 : * XLog position where it should be, at the start of the next segment.
1261 : * We do this one page at a time, to make sure we don't deadlock
1262 : * against ourselves if wal_buffers < wal_segment_size.
1263 : */
3562 heikki.linnakangas 1264 GIC 384232 : while (CurrPos < EndPos)
1265 : {
1266 : /*
1836 tgl 1267 ECB : * The minimal action to flush the page would be to call
1268 : * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1269 : * AdvanceXLInsertBuffer(...). The page would be left initialized
1270 : * mostly to zeros, except for the page header (always the short
1271 : * variant, as this is never a segment's first page).
1272 : *
1273 : * The large vistas of zeros are good for compressibility, but the
1274 : * headers interrupting them every XLOG_BLCKSZ (with values that
1275 : * differ from page to page) are not. The effect varies with
1276 : * compression tool, but bzip2 for instance compresses about an
1277 : * order of magnitude worse if those headers are left in place.
1278 : *
1279 : * Rather than complicating AdvanceXLInsertBuffer itself (which is
1280 : * called in heavily-loaded circumstances as well as this lightly-
1281 : * loaded one) with variant behavior, we just use GetXLogBuffer
1282 : * (which itself calls the two methods we need) to get the pointer
1283 : * and zero most of the page. Then we just zero the page header.
1284 : */
520 rhaas 1285 GIC 383984 : currpos = GetXLogBuffer(CurrPos, tli);
1836 tgl 1286 1535936 : MemSet(currpos, 0, SizeOfXLogShortPHD);
1287 :
3562 heikki.linnakangas 1288 CBC 383984 : CurrPos += XLOG_BLCKSZ;
3562 heikki.linnakangas 1289 ECB : }
1290 : }
3062 1291 : else
1292 : {
1293 : /* Align the end position, so that the next record starts aligned */
3062 heikki.linnakangas 1294 GIC 19399102 : CurrPos = MAXALIGN64(CurrPos);
1295 : }
1296 :
	/* Cross-check against the space that was reserved for this record. */
3562 heikki.linnakangas 1297 CBC 19399350 : if (CurrPos != EndPos)
3562 heikki.linnakangas 1298 UIC 0 : elog(PANIC, "space reserved for WAL record does not match what was written");
3562 heikki.linnakangas 1299 GIC 19399350 : }
3562 heikki.linnakangas 1300 ECB :
3562 heikki.linnakangas 1301 EUB : /*
3306 heikki.linnakangas 1302 ECB : * Acquire a WAL insertion lock, for inserting to WAL.
1303 : */
1304 : static void
3306 heikki.linnakangas 1305 GIC 19403953 : WALInsertLockAcquire(void)
1306 : {
1307 : bool immed;
3562 heikki.linnakangas 1308 ECB :
1309 : /*
1310 : * It doesn't matter which of the WAL insertion locks we acquire, so try
1311 : * the one we used last time. If the system isn't particularly busy, it's
1312 : * a good bet that it's still available, and it's good to have some
1313 : * affinity to a particular lock so that you don't unnecessarily bounce
1314 : * cache lines between processes when there's no contention.
1315 : *
1316 : * If this is the first time through in this backend, pick a lock
1317 : * (semi-)randomly. This allows the locks to be used evenly if you have a
1318 : * lot of very short connections.
1319 : */
1320 : static int lockToTry = -1;
1321 :
3306 heikki.linnakangas 1322 GIC 19403953 : if (lockToTry == -1)
3112 1323 5924 : lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
3306 1324 19403953 : MyLockNo = lockToTry;
3562 heikki.linnakangas 1325 ECB :
1326 : /*
3306 1327 : * The insertingAt value is initially set to 0, as we don't know our
1328 : * insert location yet.
1329 : */
2809 andres 1330 GIC 19403953 : immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
3306 heikki.linnakangas 1331 19403953 : if (!immed)
1332 : {
3562 heikki.linnakangas 1333 ECB : /*
3306 1334 : * If we couldn't get the lock immediately, try another lock next
1335 : * time. On a system with more insertion locks than concurrent
1336 : * inserters, this causes all the inserters to eventually migrate to a
1337 : * lock that no-one else is using. On a system with more inserters
1338 : * than locks, it still helps to distribute the inserters evenly
1339 : * across the locks.
1340 : */
3112 heikki.linnakangas 1341 GIC 635 : lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1342 : }
3562 1343 19403953 : }
3562 heikki.linnakangas 1344 ECB :
1345 : /*
3306 1346 : * Acquire all WAL insertion locks, to prevent other backends from inserting
1347 : * to WAL.
1348 : */
1349 : static void
3306 heikki.linnakangas 1350 GIC 3502 : WALInsertLockAcquireExclusive(void)
1351 : {
1352 : int i;
3562 heikki.linnakangas 1353 ECB :
1354 : /*
1355 : * When holding all the locks, all but the last lock's insertingAt
1356 : * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1357 : * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1358 : */
3112 heikki.linnakangas 1359 GIC 28016 : for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1360 : {
2809 andres 1361 24514 : LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
2809 andres 1362 CBC 24514 : LWLockUpdateVar(&WALInsertLocks[i].l.lock,
2809 andres 1363 GIC 24514 : &WALInsertLocks[i].l.insertingAt,
2809 andres 1364 ECB : PG_UINT64_MAX);
3562 heikki.linnakangas 1365 : }
2809 andres 1366 : /* Variable value reset to 0 at release */
2809 andres 1367 GIC 3502 : LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1368 :
3306 heikki.linnakangas 1369 3502 : holdingAllLocks = true;
3562 heikki.linnakangas 1370 CBC 3502 : }
1371 :
3562 heikki.linnakangas 1372 ECB : /*
3306 1373 : * Release our insertion lock (or locks, if we're holding them all).
1374 : *
1375 : * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1376 : * next time the lock is acquired.
1377 : */
1378 : static void
3306 heikki.linnakangas 1379 GIC 19407455 : WALInsertLockRelease(void)
1380 : {
1381 19407455 : if (holdingAllLocks)
3562 heikki.linnakangas 1382 ECB : {
1383 : int i;
3306 1384 :
3112 heikki.linnakangas 1385 GIC 31518 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
2809 andres 1386 28016 : LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1387 28016 : &WALInsertLocks[i].l.insertingAt,
2809 andres 1388 ECB : 0);
3562 heikki.linnakangas 1389 :
3306 heikki.linnakangas 1390 CBC 3502 : holdingAllLocks = false;
1391 : }
1392 : else
3562 heikki.linnakangas 1393 ECB : {
2809 andres 1394 GIC 19403953 : LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1395 19403953 : &WALInsertLocks[MyLockNo].l.insertingAt,
1396 : 0);
3562 heikki.linnakangas 1397 ECB : }
3562 heikki.linnakangas 1398 CBC 19407455 : }
1399 :
1400 : /*
3306 heikki.linnakangas 1401 ECB : * Update our insertingAt value, to let others know that we've finished
1402 : * inserting up to that point.
1403 : */
1404 : static void
3306 heikki.linnakangas 1405 GIC 742721 : WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1406 : {
1407 742721 : if (holdingAllLocks)
3562 heikki.linnakangas 1408 ECB : {
1409 : /*
3306 1410 : * We use the last lock to mark our actual position, see comments in
1411 : * WALInsertLockAcquireExclusive.
1412 : */
3112 heikki.linnakangas 1413 GIC 380439 : LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
2118 tgl 1414 380439 : &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1415 : insertingAt);
3562 heikki.linnakangas 1416 ECB : }
1417 : else
3306 heikki.linnakangas 1418 GIC 362282 : LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1419 362282 : &WALInsertLocks[MyLockNo].l.insertingAt,
1420 : insertingAt);
3562 heikki.linnakangas 1421 CBC 742721 : }
3562 heikki.linnakangas 1422 ECB :
1423 : /*
1424 : * Wait for any WAL insertions < upto to finish.
1425 : *
1426 : * Returns the location of the oldest insertion that is still in-progress.
1427 : * Any WAL prior to that point has been fully copied into WAL buffers, and
1428 : * can be flushed out to disk. Because this waits for any insertions older
1429 : * than 'upto' to finish, the return value is always >= 'upto'.
1430 : *
1431 : * Note: When you are about to write out WAL, you must call this function
1432 : * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1433 : * need to wait for an insertion to finish (or at least advance to next
1434 : * uninitialized page), and the inserter might need to evict an old WAL buffer
1435 : * to make room for a new one, which in turn requires WALWriteLock.
1436 : */
1437 : static XLogRecPtr
3562 heikki.linnakangas 1438 GIC 737182 : WaitXLogInsertionsToFinish(XLogRecPtr upto)
1439 : {
1440 : uint64 bytepos;
3562 heikki.linnakangas 1441 ECB : XLogRecPtr reservedUpto;
1442 : XLogRecPtr finishedUpto;
3121 andres 1443 GIC 737182 : XLogCtlInsert *Insert = &XLogCtl->Insert;
1444 : int i;
1445 :
3562 heikki.linnakangas 1446 CBC 737182 : if (MyProc == NULL)
3562 heikki.linnakangas 1447 UIC 0 : elog(PANIC, "cannot wait without a PGPROC structure");
1448 :
3562 heikki.linnakangas 1449 ECB : /* Read the current insert position */
3562 heikki.linnakangas 1450 GBC 737182 : SpinLockAcquire(&Insert->insertpos_lck);
3562 heikki.linnakangas 1451 GIC 737182 : bytepos = Insert->CurrBytePos;
1452 737182 : SpinLockRelease(&Insert->insertpos_lck);
3562 heikki.linnakangas 1453 CBC 737182 : reservedUpto = XLogBytePosToEndRecPtr(bytepos);
3562 heikki.linnakangas 1454 ECB :
1455 : /*
1456 : * No-one should request to flush a piece of WAL that hasn't even been
1457 : * reserved yet. However, it can happen if there is a block with a bogus
1458 : * LSN on disk, for example. XLogFlush checks for that situation and
1459 : * complains, but only after the flush. Here we just assume that to mean
1460 : * that all WAL that has been reserved needs to be finished. In this
1461 : * corner-case, the return value can be smaller than 'upto' argument.
1462 : */
3562 heikki.linnakangas 1463 GIC 737182 : if (upto > reservedUpto)
1464 : {
856 peter 1465 UIC 0 : ereport(LOG,
856 peter 1466 ECB : (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
1467 : LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
3562 heikki.linnakangas 1468 UBC 0 : upto = reservedUpto;
1469 : }
1470 :
3562 heikki.linnakangas 1471 EUB : /*
1472 : * Loop through all the locks, sleeping on any in-progress insert older
1473 : * than 'upto'.
1474 : *
1475 : * finishedUpto is our return value, indicating the point upto which all
1476 : * the WAL insertions have been finished. Initialize it to the head of
1477 : * reserved WAL, and as we iterate through the insertion locks, back it
1478 : * out for any insertion that's still in progress.
1479 : */
3562 heikki.linnakangas 1480 GIC 737182 : finishedUpto = reservedUpto;
3112 1481 6634638 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1482 : {
3260 bruce 1483 CBC 5897456 : XLogRecPtr insertingat = InvalidXLogRecPtr;
3260 bruce 1484 ECB :
1485 : do
3562 heikki.linnakangas 1486 : {
1487 : /*
1488 : * See if this insertion is in progress. LWLockWaitForVar will
1489 : * wait for the lock to be released, or for the 'value' to be set
1490 : * by a LWLockUpdateVar call. When a lock is initially acquired,
1491 : * its value is 0 (InvalidXLogRecPtr), which means that we don't
1492 : * know where it's inserting yet. We will have to wait for it. If
1493 : * it's a small insertion, the record will most likely fit on the
1494 : * same page and the inserter will release the lock without ever
1495 : * calling LWLockUpdateVar. But if it has to sleep, it will
1496 : * advertise the insertion point with LWLockUpdateVar before
1497 : * sleeping.
1498 : */
3306 heikki.linnakangas 1499 GIC 5899012 : if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1500 5899012 : &WALInsertLocks[i].l.insertingAt,
1501 : insertingat, &insertingat))
3306 heikki.linnakangas 1502 ECB : {
1503 : /* the lock was free, so no insertion in progress */
3306 heikki.linnakangas 1504 GIC 2958401 : insertingat = InvalidXLogRecPtr;
1505 2958401 : break;
1506 : }
3562 heikki.linnakangas 1507 ECB :
1508 : /*
1509 : * This insertion is still in progress. Have to wait, unless the
1510 : * inserter has proceeded past 'upto'.
1511 : */
3306 heikki.linnakangas 1512 GIC 2940611 : } while (insertingat < upto);
1513 :
1514 5897456 : if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
3306 heikki.linnakangas 1515 CBC 435915 : finishedUpto = insertingat;
1516 : }
3562 1517 737182 : return finishedUpto;
3562 heikki.linnakangas 1518 ECB : }
1519 :
1520 : /*
1521 : * Get a pointer to the right location in the WAL buffer containing the
1522 : * given XLogRecPtr.
1523 : *
1524 : * If the page is not initialized yet, it is initialized. That might require
1525 : * evicting an old dirty buffer from the buffer cache, which means I/O.
1526 : *
1527 : * The caller must ensure that the page containing the requested location
1528 : * isn't evicted yet, and won't be evicted. The way to ensure that is to
1529 : * hold onto a WAL insertion lock with the insertingAt position set to
1530 : * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1531 : * to evict an old page from the buffer. (This means that once you call
1532 : * GetXLogBuffer() with a given 'ptr', you must not access anything before
1533 : * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1534 : * later, because older buffers might be recycled already)
1535 : */
1536 : static char *
520 rhaas 1537 GIC 20274329 : GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
1538 : {
1539 : int idx;
3562 heikki.linnakangas 1540 ECB : XLogRecPtr endptr;
1541 : static uint64 cachedPage = 0;
1542 : static char *cachedPos = NULL;
1543 : XLogRecPtr expectedEndPtr;
1544 :
1545 : /*
1546 : * Fast path for the common case that we need to access again the same
1547 : * page as last time.
1548 : */
3562 heikki.linnakangas 1549 GIC 20274329 : if (ptr / XLOG_BLCKSZ == cachedPage)
1550 : {
1551 19285796 : Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
3562 heikki.linnakangas 1552 CBC 19285796 : Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
3562 heikki.linnakangas 1553 GIC 19285796 : return cachedPos + ptr % XLOG_BLCKSZ;
6997 tgl 1554 ECB : }
1555 :
3562 heikki.linnakangas 1556 : /*
1557 : * The XLog buffer cache is organized so that a page is always loaded to a
1558 : * particular buffer. That way we can easily calculate the buffer a given
1559 : * page must be loaded into, from the XLogRecPtr alone.
1560 : */
3562 heikki.linnakangas 1561 GIC 988533 : idx = XLogRecPtrToBufIdx(ptr);
1562 :
1563 : /*
3562 heikki.linnakangas 1564 ECB : * See what page is loaded in the buffer at the moment. It could be the
1565 : * page we're looking for, or something older. It can't be anything newer
1566 : * - that would imply the page we're looking for has already been written
1567 : * out to disk and evicted, and the caller is responsible for making sure
1568 : * that doesn't happen.
1569 : *
1570 : * However, we don't hold a lock while we read the value. If someone has
1571 : * just initialized the page, it's possible that we get a "torn read" of
1572 : * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1573 : * that case we will see a bogus value. That's ok, we'll grab the mapping
1574 : * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1575 : * the page we're looking for. But it means that when we do this unlocked
1576 : * read, we might see a value that appears to be ahead of the page we're
1577 : * looking for. Don't PANIC on that, until we've verified the value while
1578 : * holding the lock.
1579 : */
3562 heikki.linnakangas 1580 GIC 988533 : expectedEndPtr = ptr;
1581 988533 : expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1582 :
3562 heikki.linnakangas 1583 CBC 988533 : endptr = XLogCtl->xlblocks[idx];
1584 988533 : if (expectedEndPtr != endptr)
1585 : {
2807 heikki.linnakangas 1586 ECB : XLogRecPtr initializedUpto;
1587 :
1588 : /*
1589 : * Before calling AdvanceXLInsertBuffer(), which can block, let others
1590 : * know how far we're finished with inserting the record.
1591 : *
1592 : * NB: If 'ptr' points to just after the page header, advertise a
1593 : * position at the beginning of the page rather than 'ptr' itself. If
1594 : * there are no other insertions running, someone might try to flush
1595 : * up to our advertised location. If we advertised a position after
1596 : * the page header, someone might try to flush the page header, even
1597 : * though page might actually not be initialized yet. As the first
1598 : * inserter on the page, we are effectively responsible for making
1599 : * sure that it's initialized, before we let insertingAt to move past
1600 : * the page header.
1601 : */
2807 heikki.linnakangas 1602 GIC 742721 : if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
2028 andres 1603 12172 : XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
2807 heikki.linnakangas 1604 12172 : initializedUpto = ptr - SizeOfXLogShortPHD;
2807 heikki.linnakangas 1605 CBC 730549 : else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
2028 andres 1606 343 : XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
2807 heikki.linnakangas 1607 173 : initializedUpto = ptr - SizeOfXLogLongPHD;
2807 heikki.linnakangas 1608 ECB : else
2807 heikki.linnakangas 1609 CBC 730376 : initializedUpto = ptr;
2807 heikki.linnakangas 1610 ECB :
2807 heikki.linnakangas 1611 GIC 742721 : WALInsertLockUpdateInsertingAt(initializedUpto);
6090 tgl 1612 ECB :
520 rhaas 1613 GIC 742721 : AdvanceXLInsertBuffer(ptr, tli, false);
3562 heikki.linnakangas 1614 CBC 742721 : endptr = XLogCtl->xlblocks[idx];
1615 :
1616 742721 : if (expectedEndPtr != endptr)
3562 heikki.linnakangas 1617 LBC 0 : elog(PANIC, "could not find WAL buffer for %X/%X",
1618 : LSN_FORMAT_ARGS(ptr));
3562 heikki.linnakangas 1619 ECB : }
3562 heikki.linnakangas 1620 EUB : else
1621 : {
1622 : /*
1623 : * Make sure the initialization of the page is visible to us, and
1624 : * won't arrive later to overwrite the WAL data we write on the page.
1625 : */
3562 heikki.linnakangas 1626 GIC 245812 : pg_memory_barrier();
1627 : }
1628 :
3562 heikki.linnakangas 1629 ECB : /*
1630 : * Found the buffer holding this page. Return a pointer to the right
1631 : * offset within the page.
1632 : */
3562 heikki.linnakangas 1633 GIC 988533 : cachedPage = ptr / XLOG_BLCKSZ;
1634 988533 : cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1635 :
3562 heikki.linnakangas 1636 CBC 988533 : Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1637 988533 : Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1638 :
1639 988533 : return cachedPos + ptr % XLOG_BLCKSZ;
3562 heikki.linnakangas 1640 ECB : }
1641 :
1642 : /*
1643 : * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1644 : * is the position starting from the beginning of WAL, excluding all WAL
1645 : * page headers.
1646 : */
1647 : static XLogRecPtr
3562 heikki.linnakangas 1648 GIC 38806008 : XLogBytePosToRecPtr(uint64 bytepos)
1649 : {
1650 : uint64 fullsegs;
3562 heikki.linnakangas 1651 ECB : uint64 fullpages;
1652 : uint64 bytesleft;
1653 : uint32 seg_offset;
1654 : XLogRecPtr result;
1655 :
3562 heikki.linnakangas 1656 GIC 38806008 : fullsegs = bytepos / UsableBytesInSegment;
1657 38806008 : bytesleft = bytepos % UsableBytesInSegment;
1658 :
3562 heikki.linnakangas 1659 CBC 38806008 : if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
8205 vadim4o 1660 ECB : {
1661 : /* fits on first page of segment */
3562 heikki.linnakangas 1662 CBC 73916 : seg_offset = bytesleft + SizeOfXLogLongPHD;
1663 : }
1664 : else
8595 vadim4o 1665 ECB : {
1666 : /* account for the first page on segment with long header */
3562 heikki.linnakangas 1667 GIC 38732092 : seg_offset = XLOG_BLCKSZ;
1668 38732092 : bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1669 :
3562 heikki.linnakangas 1670 CBC 38732092 : fullpages = bytesleft / UsableBytesInPage;
1671 38732092 : bytesleft = bytesleft % UsableBytesInPage;
1672 :
1673 38732092 : seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
8595 vadim4o 1674 ECB : }
1675 :
1735 alvherre 1676 CBC 38806008 : XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1677 :
3562 heikki.linnakangas 1678 GIC 38806008 : return result;
3562 heikki.linnakangas 1679 ECB : }
1680 :
1681 : /*
1682 : * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1683 : * returns a pointer to the beginning of the page (ie. before page header),
1684 : * not to where the first xlog record on that page would go to. This is used
1685 : * when converting a pointer to the end of a record.
1686 : */
1687 : static XLogRecPtr
3562 heikki.linnakangas 1688 GIC 20136832 : XLogBytePosToEndRecPtr(uint64 bytepos)
1689 : {
1690 : uint64 fullsegs;
3562 heikki.linnakangas 1691 ECB : uint64 fullpages;
1692 : uint64 bytesleft;
1693 : uint32 seg_offset;
1694 : XLogRecPtr result;
1695 :
3562 heikki.linnakangas 1696 GIC 20136832 : fullsegs = bytepos / UsableBytesInSegment;
1697 20136832 : bytesleft = bytepos % UsableBytesInSegment;
1698 :
3562 heikki.linnakangas 1699 CBC 20136832 : if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
3562 heikki.linnakangas 1700 ECB : {
1701 : /* fits on first page of segment */
3562 heikki.linnakangas 1702 CBC 394401 : if (bytesleft == 0)
3562 heikki.linnakangas 1703 GIC 357373 : seg_offset = 0;
1704 : else
3562 heikki.linnakangas 1705 CBC 37028 : seg_offset = bytesleft + SizeOfXLogLongPHD;
3562 heikki.linnakangas 1706 ECB : }
1707 : else
6090 tgl 1708 : {
1709 : /* account for the first page on segment with long header */
3562 heikki.linnakangas 1710 GIC 19742431 : seg_offset = XLOG_BLCKSZ;
1711 19742431 : bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1712 :
3562 heikki.linnakangas 1713 CBC 19742431 : fullpages = bytesleft / UsableBytesInPage;
1714 19742431 : bytesleft = bytesleft % UsableBytesInPage;
1715 :
1716 19742431 : if (bytesleft == 0)
1717 20224 : seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1718 : else
1719 19722207 : seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
3562 heikki.linnakangas 1720 ECB : }
1721 :
1735 alvherre 1722 CBC 20136832 : XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1723 :
3562 heikki.linnakangas 1724 GIC 20136832 : return result;
3562 heikki.linnakangas 1725 ECB : }
1726 :
1727 : /*
1728 : * Convert an XLogRecPtr to a "usable byte position".
1729 : */
1730 : static uint64
3562 heikki.linnakangas 1731 GIC 58200582 : XLogRecPtrToBytePos(XLogRecPtr ptr)
1732 : {
1733 : uint64 fullsegs;
3562 heikki.linnakangas 1734 ECB : uint32 fullpages;
1735 : uint32 offset;
1736 : uint64 result;
1737 :
2028 andres 1738 GIC 58200582 : XLByteToSeg(ptr, fullsegs, wal_segment_size);
1739 :
1740 58200582 : fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
3562 heikki.linnakangas 1741 CBC 58200582 : offset = ptr % XLOG_BLCKSZ;
1742 :
1743 58200582 : if (fullpages == 0)
3562 heikki.linnakangas 1744 ECB : {
3562 heikki.linnakangas 1745 GIC 111171 : result = fullsegs * UsableBytesInSegment;
3562 heikki.linnakangas 1746 CBC 111171 : if (offset > 0)
1747 : {
1748 110641 : Assert(offset >= SizeOfXLogLongPHD);
1749 110641 : result += offset - SizeOfXLogLongPHD;
1750 : }
6090 tgl 1751 ECB : }
8595 vadim4o 1752 : else
1753 : {
3562 heikki.linnakangas 1754 GIC 58089411 : result = fullsegs * UsableBytesInSegment +
3260 bruce 1755 58089411 : (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2118 tgl 1756 58089411 : (fullpages - 1) * UsableBytesInPage; /* full pages */
3562 heikki.linnakangas 1757 CBC 58089411 : if (offset > 0)
6090 tgl 1758 ECB : {
3562 heikki.linnakangas 1759 CBC 58069581 : Assert(offset >= SizeOfXLogShortPHD);
1760 58069581 : result += offset - SizeOfXLogShortPHD;
1761 : }
8595 vadim4o 1762 ECB : }
1763 :
3562 heikki.linnakangas 1764 GIC 58200582 : return result;
1765 : }
1766 :
8062 tgl 1767 ECB : /*
1768 : * Initialize XLOG buffers, writing out old buffers if they still contain
1769 : * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
1770 : * true, initialize as many pages as we can without having to write out
1771 : * unwritten data. Any new pages are initialized to zeros, with pages headers
1772 : * initialized properly.
1773 : */
1774 : static void
520 rhaas 1775 GIC 749930 : AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
1776 : {
8062 tgl 1777 749930 : XLogCtlInsert *Insert = &XLogCtl->Insert;
3562 heikki.linnakangas 1778 ECB : int nextidx;
1779 : XLogRecPtr OldPageRqstPtr;
8062 tgl 1780 : XLogwrtRqst WriteRqst;
3562 heikki.linnakangas 1781 GIC 749930 : XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
1782 : XLogRecPtr NewPageBeginPtr;
1783 : XLogPageHeader NewPage;
201 tgl 1784 CBC 749930 : int npages pg_attribute_unused() = 0;
1785 :
3562 heikki.linnakangas 1786 GIC 749930 : LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
8595 vadim4o 1787 ECB :
1788 : /*
3562 heikki.linnakangas 1789 : * Now that we have the lock, check if someone initialized the page
1790 : * already.
1791 : */
3553 heikki.linnakangas 1792 GIC 2102859 : while (upto >= XLogCtl->InitializedUpTo || opportunistic)
1793 : {
1794 1360138 : nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
7862 tgl 1795 ECB :
1796 : /*
3562 heikki.linnakangas 1797 : * Get ending-offset of the buffer page we need to replace (this may
1798 : * be zero if the buffer hasn't been used yet). Fall through if it's
1799 : * already written out.
1800 : */
3562 heikki.linnakangas 1801 GIC 1360138 : OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
3754 alvherre 1802 1360138 : if (LogwrtResult.Write < OldPageRqstPtr)
1803 : {
3562 heikki.linnakangas 1804 ECB : /*
1805 : * Nope, got work to do. If we just want to pre-initialize as much
1806 : * as we can without flushing, give up now.
1807 : */
3562 heikki.linnakangas 1808 GIC 442792 : if (opportunistic)
1809 7209 : break;
1810 :
3562 heikki.linnakangas 1811 ECB : /* Before waiting, get info_lck and update LogwrtResult */
3121 andres 1812 CBC 435583 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 1813 GIC 435583 : if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
1814 357826 : XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
3121 andres 1815 CBC 435583 : LogwrtResult = XLogCtl->LogwrtResult;
1816 435583 : SpinLockRelease(&XLogCtl->info_lck);
3562 heikki.linnakangas 1817 ECB :
1818 : /*
1819 : * Now that we have an up-to-date LogwrtResult value, see if we
1820 : * still need to write it or if someone else already did.
1821 : */
3562 heikki.linnakangas 1822 GIC 435583 : if (LogwrtResult.Write < OldPageRqstPtr)
1823 : {
1824 : /*
3562 heikki.linnakangas 1825 ECB : * Must acquire write lock. Release WALBufMappingLock first,
1826 : * to make sure that all insertions that we need to wait for
1827 : * can finish (up to this same position). Otherwise we risk
1828 : * deadlock.
1829 : */
3562 heikki.linnakangas 1830 GIC 433903 : LWLockRelease(WALBufMappingLock);
1831 :
1832 433903 : WaitXLogInsertionsToFinish(OldPageRqstPtr);
3562 heikki.linnakangas 1833 ECB :
3562 heikki.linnakangas 1834 GIC 433903 : LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3562 heikki.linnakangas 1835 ECB :
3562 heikki.linnakangas 1836 GIC 433903 : LogwrtResult = XLogCtl->LogwrtResult;
3562 heikki.linnakangas 1837 CBC 433903 : if (LogwrtResult.Write >= OldPageRqstPtr)
1838 : {
3562 heikki.linnakangas 1839 ECB : /* OK, someone wrote it already */
3562 heikki.linnakangas 1840 CBC 1477 : LWLockRelease(WALWriteLock);
1841 : }
1842 : else
3562 heikki.linnakangas 1843 ECB : {
1844 : /* Have to write it ourselves */
1845 : TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
3562 heikki.linnakangas 1846 GIC 432426 : WriteRqst.Write = OldPageRqstPtr;
1847 432426 : WriteRqst.Flush = 0;
520 rhaas 1848 432426 : XLogWrite(WriteRqst, tli, false);
3562 heikki.linnakangas 1849 CBC 432426 : LWLockRelease(WALWriteLock);
368 andres 1850 432426 : PendingWalStats.wal_buffers_full++;
3562 heikki.linnakangas 1851 ECB : TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1852 : }
1853 : /* Re-acquire WALBufMappingLock and retry */
3562 heikki.linnakangas 1854 GIC 433903 : LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
1855 433903 : continue;
1856 : }
8595 vadim4o 1857 ECB : }
1858 :
1859 : /*
1860 : * Now the next buffer slot is free and we can set it up to be the
1861 : * next output page.
1862 : */
3553 heikki.linnakangas 1863 GIC 919026 : NewPageBeginPtr = XLogCtl->InitializedUpTo;
3562 1864 919026 : NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
1865 :
3562 heikki.linnakangas 1866 CBC 919026 : Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
6090 tgl 1867 ECB :
3562 heikki.linnakangas 1868 GIC 919026 : NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
6385 bruce 1869 ECB :
1870 : /*
3562 heikki.linnakangas 1871 : * Be sure to re-zero the buffer so that bytes beyond what we've
1872 : * written will look like zeroes and not valid XLOG records...
1873 : */
3562 heikki.linnakangas 1874 GIC 919026 : MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1875 :
1876 : /*
3562 heikki.linnakangas 1877 ECB : * Fill the new page's header
1878 : */
2878 bruce 1879 GIC 919026 : NewPage->xlp_magic = XLOG_PAGE_MAGIC;
1880 :
1881 : /* NewPage->xlp_info = 0; */ /* done by memset */
520 rhaas 1882 CBC 919026 : NewPage->xlp_tli = tli;
2878 bruce 1883 GIC 919026 : NewPage->xlp_pageaddr = NewPageBeginPtr;
1884 :
3260 bruce 1885 ECB : /* NewPage->xlp_rem_len = 0; */ /* done by memset */
7934 tgl 1886 :
1887 : /*
1888 : * If online backup is not in progress, mark the header to indicate
1889 : * that WAL records beginning in this page have removable backup
1890 : * blocks. This allows the WAL archiver to know whether it is safe to
1891 : * compress archived WAL data by transforming full-block records into
1892 : * the non-full-block format. It is sufficient to record this at the
1893 : * page level because we force a page switch (in fact a segment
1894 : * switch) when starting a backup, so the flag will be off before any
1895 : * records can be written during the backup. At the end of a backup,
1896 : * the last page will be marked as all unsafe when perhaps only part
1897 : * is unsafe, but at worst the archiver would miss the opportunity to
1898 : * compress a few records.
1899 : */
172 alvherre 1900 GNC 919026 : if (Insert->runningBackups == 0)
2878 bruce 1901 GIC 810359 : NewPage->xlp_info |= XLP_BKP_REMOVABLE;
1902 :
3562 heikki.linnakangas 1903 ECB : /*
1904 : * If first page of an XLOG segment file, make it a long header.
1905 : */
2028 andres 1906 GIC 919026 : if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
1907 : {
3562 heikki.linnakangas 1908 613 : XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
8062 tgl 1909 ECB :
3562 heikki.linnakangas 1910 GIC 613 : NewLongPage->xlp_sysid = ControlFile->system_identifier;
2028 andres 1911 CBC 613 : NewLongPage->xlp_seg_size = wal_segment_size;
3562 heikki.linnakangas 1912 GIC 613 : NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2878 bruce 1913 CBC 613 : NewPage->xlp_info |= XLP_LONG_HEADER;
3562 heikki.linnakangas 1914 ECB : }
4136 tgl 1915 :
3562 heikki.linnakangas 1916 : /*
1917 : * Make sure the initialization of the page becomes visible to others
1918 : * before the xlblocks update. GetXLogBuffer() reads xlblocks without
1919 : * holding a lock.
1920 : */
3562 heikki.linnakangas 1921 GIC 919026 : pg_write_barrier();
1922 :
1923 919026 : *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
6997 tgl 1924 ECB :
3553 heikki.linnakangas 1925 GIC 919026 : XLogCtl->InitializedUpTo = NewPageEndPtr;
6385 bruce 1926 ECB :
3562 heikki.linnakangas 1927 GIC 919026 : npages++;
6997 tgl 1928 ECB : }
3562 heikki.linnakangas 1929 GIC 749930 : LWLockRelease(WALBufMappingLock);
6997 tgl 1930 ECB :
1931 : #ifdef WAL_DEBUG
2849 andres 1932 : if (XLOG_DEBUG && npages > 0)
1933 : {
1934 : elog(DEBUG1, "initialized %d pages, up to %X/%X",
1935 : npages, LSN_FORMAT_ARGS(NewPageEndPtr));
1936 : }
1937 : #endif
8595 vadim4o 1938 GIC 749930 : }
1939 :
1940 : /*
2196 simon 1941 ECB : * Calculate CheckPointSegments based on max_wal_size_mb and
1942 : * checkpoint_completion_target.
1943 : */
1944 : static void
2967 heikki.linnakangas 1945 GIC 9091 : CalculateCheckpointSegments(void)
1946 : {
1947 : double target;
2967 heikki.linnakangas 1948 ECB :
1949 : /*-------
1950 : * Calculate the distance at which to trigger a checkpoint, to avoid
1951 : * exceeding max_wal_size_mb. This is based on two assumptions:
1952 : *
1953 : * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
1954 : * WAL for two checkpoint cycles to allow us to recover from the
1955 : * secondary checkpoint if the first checkpoint failed, though we
1956 : * only did this on the primary anyway, not on standby. Keeping just
1957 : * one checkpoint simplifies processing and reduces disk space in
1958 : * many smaller databases.)
1959 : * b) during checkpoint, we consume checkpoint_completion_target *
1960 : * number of segments consumed between checkpoints.
1961 : *-------
1962 : */
2028 andres 1963 GIC 9091 : target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
1979 simon 1964 9091 : (1.0 + CheckPointCompletionTarget);
1965 :
2967 heikki.linnakangas 1966 ECB : /* round down */
2967 heikki.linnakangas 1967 CBC 9091 : CheckPointSegments = (int) target;
1968 :
2967 heikki.linnakangas 1969 GIC 9091 : if (CheckPointSegments < 1)
2967 heikki.linnakangas 1970 CBC 9 : CheckPointSegments = 1;
2967 heikki.linnakangas 1971 GIC 9091 : }
2967 heikki.linnakangas 1972 ECB :
1973 : void
2967 heikki.linnakangas 1974 CBC 6012 : assign_max_wal_size(int newval, void *extra)
1975 : {
2196 simon 1976 GIC 6012 : max_wal_size_mb = newval;
2967 heikki.linnakangas 1977 CBC 6012 : CalculateCheckpointSegments();
2967 heikki.linnakangas 1978 GIC 6012 : }
2967 heikki.linnakangas 1979 ECB :
1980 : void
2967 heikki.linnakangas 1981 CBC 1857 : assign_checkpoint_completion_target(double newval, void *extra)
1982 : {
2967 heikki.linnakangas 1983 GIC 1857 : CheckPointCompletionTarget = newval;
2967 heikki.linnakangas 1984 CBC 1857 : CalculateCheckpointSegments();
2967 heikki.linnakangas 1985 GIC 1857 : }
2967 heikki.linnakangas 1986 ECB :
1987 : /*
1988 : * At a checkpoint, how many WAL segments to recycle as preallocated future
1989 : * XLOG segments? Returns the highest segment that should be preallocated.
1990 : */
1991 : static XLogSegNo
1208 michael 1992 GIC 2363 : XLOGfileslop(XLogRecPtr lastredoptr)
1993 : {
1994 : XLogSegNo minSegNo;
2967 heikki.linnakangas 1995 ECB : XLogSegNo maxSegNo;
1996 : double distance;
1997 : XLogSegNo recycleSegNo;
1998 :
1999 : /*
2000 : * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2001 : * correspond to. Always recycle enough segments to meet the minimum, and
2002 : * remove enough segments to stay below the maximum.
2003 : */
1208 michael 2004 GIC 2363 : minSegNo = lastredoptr / wal_segment_size +
2028 andres 2005 2363 : ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
1208 michael 2006 2363 : maxSegNo = lastredoptr / wal_segment_size +
2028 andres 2007 CBC 2363 : ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2967 heikki.linnakangas 2008 ECB :
2009 : /*
2010 : * Between those limits, recycle enough segments to get us through to the
2011 : * estimated end of next checkpoint.
2012 : *
2013 : * To estimate where the next checkpoint will finish, assume that the
2014 : * system runs steadily consuming CheckPointDistanceEstimate bytes between
2015 : * every checkpoint.
2016 : */
1979 simon 2017 GIC 2363 : distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2018 : /* add 10% for good measure. */
2967 heikki.linnakangas 2019 2363 : distance *= 1.10;
2967 heikki.linnakangas 2020 ECB :
1208 michael 2021 GIC 2363 : recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2028 andres 2022 ECB : wal_segment_size);
2023 :
2967 heikki.linnakangas 2024 CBC 2363 : if (recycleSegNo < minSegNo)
2967 heikki.linnakangas 2025 GIC 2219 : recycleSegNo = minSegNo;
2026 2363 : if (recycleSegNo > maxSegNo)
2967 heikki.linnakangas 2027 CBC 55 : recycleSegNo = maxSegNo;
2967 heikki.linnakangas 2028 ECB :
2967 heikki.linnakangas 2029 CBC 2363 : return recycleSegNo;
2967 heikki.linnakangas 2030 ECB : }
2031 :
5658 tgl 2032 : /*
2033 : * Check whether we've consumed enough xlog space that a checkpoint is needed.
2034 : *
2035 : * new_segno indicates a log file that has just been filled up (or read
2036 : * during recovery). We measure the distance from RedoRecPtr to new_segno
2037 : * and see if that exceeds CheckPointSegments.
2038 : *
2039 : * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2040 : */
2041 : bool
3941 heikki.linnakangas 2042 GIC 865 : XLogCheckpointNeeded(XLogSegNo new_segno)
2043 : {
2044 : XLogSegNo old_segno;
3941 heikki.linnakangas 2045 ECB :
2028 andres 2046 GIC 865 : XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2047 :
3941 heikki.linnakangas 2048 865 : if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
5658 tgl 2049 CBC 147 : return true;
5658 tgl 2050 GIC 718 : return false;
5658 tgl 2051 ECB : }
2052 :
8062 2053 : /*
2054 : * Write and/or fsync the log at least as far as WriteRqst indicates.
2055 : *
2056 : * If flexible == true, we don't have to write as far as WriteRqst, but
2057 : * may stop at any convenient boundary (such as a cache or logfile boundary).
2058 : * This option allows us to avoid uselessly issuing multiple writes when a
2059 : * single one would do.
2060 : *
2061 : * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2062 : * must be called before grabbing the lock, to make sure the data is ready to
2063 : * write.
2064 : */
2065 : static void
520 rhaas 2066 GIC 731996 : XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
2067 : {
2068 : bool ispartialpage;
6090 tgl 2069 ECB : bool last_iteration;
2070 : bool finishing_seg;
2071 : int curridx;
2072 : int npages;
2073 : int startidx;
2074 : uint32 startoffset;
2075 :
2076 : /* We should always be inside a critical section here */
6568 tgl 2077 GIC 731996 : Assert(CritSectionCount > 0);
2078 :
2079 : /*
6385 bruce 2080 ECB : * Update local LogwrtResult (caller probably did this already, but...)
2081 : */
4051 heikki.linnakangas 2082 GIC 731996 : LogwrtResult = XLogCtl->LogwrtResult;
2083 :
2084 : /*
6439 tgl 2085 ECB : * Since successive pages in the xlog cache are consecutively allocated,
2086 : * we can usually gather multiple pages together and issue just one
2087 : * write() call. npages is the number of pages we have determined can be
2088 : * written together; startidx is the cache block index of the first one,
2089 : * and startoffset is the file offset at which it should go. The latter
2090 : * two variables are only valid when npages > 0, but we must initialize
2091 : * all of them to keep the compiler quiet.
2092 : */
6439 tgl 2093 GIC 731996 : npages = 0;
2094 731996 : startidx = 0;
2095 731996 : startoffset = 0;
6439 tgl 2096 ECB :
2097 : /*
2098 : * Within the loop, curridx is the cache block index of the page to
2099 : * consider writing. Begin at the buffer containing the next unwritten
2100 : * page, or last partially written page.
2101 : */
3553 heikki.linnakangas 2102 GIC 731996 : curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2103 :
3754 alvherre 2104 1617944 : while (LogwrtResult.Write < WriteRqst.Write)
8595 vadim4o 2105 ECB : {
2106 : /*
6385 bruce 2107 : * Make sure we're not ahead of the insert process. This could happen
2108 : * if we're passed a bogus WriteRqst.Write that is past the end of the
2109 : * last page that's been initialized by AdvanceXLInsertBuffer.
2110 : */
3260 bruce 2111 GIC 1184781 : XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2112 :
3562 heikki.linnakangas 2113 1184781 : if (LogwrtResult.Write >= EndPtr)
7202 tgl 2114 LBC 0 : elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2115 : LSN_FORMAT_ARGS(LogwrtResult.Write),
775 peter 2116 ECB : LSN_FORMAT_ARGS(EndPtr));
8059 tgl 2117 EUB :
2118 : /* Advance LogwrtResult.Write to end of current buffer page */
3562 heikki.linnakangas 2119 GIC 1184781 : LogwrtResult.Write = EndPtr;
3754 alvherre 2120 1184781 : ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2121 :
2028 andres 2122 CBC 1184781 : if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2028 andres 2123 ECB : wal_segment_size))
2124 : {
8062 tgl 2125 : /*
2126 : * Switch to new logfile segment. We cannot have any pending
2127 : * pages here (since we dump what we have at segment end).
2128 : */
6439 tgl 2129 GIC 7172 : Assert(npages == 0);
8062 2130 7172 : if (openLogFile >= 0)
6142 bruce 2131 1626 : XLogFileClose();
2028 andres 2132 CBC 7172 : XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2028 andres 2133 ECB : wal_segment_size);
520 rhaas 2134 CBC 7172 : openLogTLI = tli;
8062 tgl 2135 ECB :
2136 : /* create/use new log file */
520 rhaas 2137 CBC 7172 : openLogFile = XLogFileInit(openLogSegNo, tli);
1140 tgl 2138 GIC 7172 : ReserveExternalFD();
2139 : }
8595 vadim4o 2140 ECB :
6439 tgl 2141 : /* Make sure we have the current logfile open */
8062 tgl 2142 GIC 1184781 : if (openLogFile < 0)
2143 : {
2028 andres 2144 UIC 0 : XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2028 andres 2145 ECB : wal_segment_size);
520 rhaas 2146 UIC 0 : openLogTLI = tli;
520 rhaas 2147 UBC 0 : openLogFile = XLogFileOpen(openLogSegNo, tli);
1140 tgl 2148 UIC 0 : ReserveExternalFD();
8595 vadim4o 2149 EUB : }
2150 :
6439 tgl 2151 : /* Add current page to the set of pending pages-to-dump */
6439 tgl 2152 GIC 1184781 : if (npages == 0)
2153 : {
2154 : /* first of group */
6439 tgl 2155 CBC 736563 : startidx = curridx;
2028 andres 2156 GIC 736563 : startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2157 : wal_segment_size);
6439 tgl 2158 ECB : }
6439 tgl 2159 CBC 1184781 : npages++;
2160 :
2161 : /*
6385 bruce 2162 ECB : * Dump the set if this will be the last loop iteration, or if we are
2163 : * at the last page of the cache area (since the next page won't be
2164 : * contiguous in memory), or if we are at the end of the logfile
2165 : * segment.
2166 : */
3754 alvherre 2167 GIC 1184781 : last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2168 :
6439 tgl 2169 2075101 : finishing_seg = !ispartialpage &&
2028 andres 2170 CBC 890320 : (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2171 :
6090 tgl 2172 1184781 : if (last_iteration ||
6439 2173 453199 : curridx == XLogCtl->XLogCacheBlck ||
2174 : finishing_seg)
8062 tgl 2175 ECB : {
6439 2176 : char *from;
2177 : Size nbytes;
2178 : Size nleft;
2179 : int written;
2180 : instr_time start;
2181 :
2182 : /* OK to write the page(s) */
6215 tgl 2183 GIC 736563 : from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2184 736563 : nbytes = npages * (Size) XLOG_BLCKSZ;
3569 heikki.linnakangas 2185 736563 : nleft = nbytes;
3569 heikki.linnakangas 2186 ECB : do
6439 tgl 2187 : {
3569 heikki.linnakangas 2188 CBC 736563 : errno = 0;
2189 :
2190 : /* Measure I/O timing to write WAL data */
761 fujii 2191 736563 : if (track_wal_io_timing)
761 fujii 2192 UIC 0 : INSTR_TIME_SET_CURRENT(start);
2193 : else
79 andres 2194 GNC 736563 : INSTR_TIME_SET_ZERO(start);
2195 :
2213 rhaas 2196 CBC 736563 : pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
192 tmunro 2197 GBC 736563 : written = pg_pwrite(openLogFile, from, nleft, startoffset);
2213 rhaas 2198 GIC 736563 : pgstat_report_wait_end();
761 fujii 2199 ECB :
2200 : /*
2201 : * Increment the I/O timing and the number of times WAL data
2202 : * were written out to disk.
2203 : */
761 fujii 2204 GIC 736563 : if (track_wal_io_timing)
2205 : {
2206 : instr_time duration;
2207 :
761 fujii 2208 UIC 0 : INSTR_TIME_SET_CURRENT(duration);
10 andres 2209 UNC 0 : INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, duration, start);
2210 : }
2211 :
368 andres 2212 GBC 736563 : PendingWalStats.wal_write++;
761 fujii 2213 EUB :
3569 heikki.linnakangas 2214 GIC 736563 : if (written <= 0)
2215 : {
1223 michael 2216 ECB : char xlogfname[MAXFNAMELEN];
2217 : int save_errno;
2218 :
3569 heikki.linnakangas 2219 UIC 0 : if (errno == EINTR)
2220 0 : continue;
2221 :
1223 michael 2222 0 : save_errno = errno;
520 rhaas 2223 UBC 0 : XLogFileName(xlogfname, tli, openLogSegNo,
1223 michael 2224 EUB : wal_segment_size);
1223 michael 2225 UIC 0 : errno = save_errno;
3569 heikki.linnakangas 2226 UBC 0 : ereport(PANIC,
3569 heikki.linnakangas 2227 EUB : (errcode_for_file_access(),
2228 : errmsg("could not write to log file %s "
3363 tgl 2229 : "at offset %u, length %zu: %m",
1223 michael 2230 : xlogfname, startoffset, nleft)));
2231 : }
3569 heikki.linnakangas 2232 GIC 736563 : nleft -= written;
2233 736563 : from += written;
1614 tmunro 2234 736563 : startoffset += written;
3569 heikki.linnakangas 2235 736563 : } while (nleft > 0);
6439 tgl 2236 ECB :
6439 tgl 2237 CBC 736563 : npages = 0;
6439 tgl 2238 ECB :
2239 : /*
2240 : * If we just wrote the whole last page of a logfile segment,
2241 : * fsync the segment immediately. This avoids having to go back
2242 : * and re-open prior segments when an fsync request comes along
2243 : * later. Doing it here ensures that one and only one backend will
2244 : * perform this fsync.
2245 : *
2246 : * This is also the right place to notify the Archiver that the
2247 : * segment is ready to copy to archival storage, and to update the
2248 : * timer for archive_timeout, and to signal for a checkpoint if
2249 : * too many logfile segments have been used since the last
2250 : * checkpoint.
2251 : */
3562 heikki.linnakangas 2252 GIC 736563 : if (finishing_seg)
2253 : {
520 rhaas 2254 692 : issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2255 :
3933 rhaas 2256 ECB : /* signal that we need to wakeup walsenders later */
3933 rhaas 2257 GIC 692 : WalSndWakeupRequest();
3933 rhaas 2258 ECB :
2118 tgl 2259 GIC 692 : LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
2260 :
6439 tgl 2261 CBC 692 : if (XLogArchivingActive())
520 rhaas 2262 GIC 33 : XLogArchiveNotifySeg(openLogSegNo, tli);
6079 tgl 2263 ECB :
3553 heikki.linnakangas 2264 GIC 692 : XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2299 andres 2265 CBC 692 : XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
5966 tgl 2266 ECB :
2267 : /*
3955 bruce 2268 : * Request a checkpoint if we've consumed too much xlog since
2269 : * the last one. For speed, we first check using the local
2270 : * copy of RedoRecPtr, which might be out of date; if it looks
2271 : * like a checkpoint is needed, forcibly update RedoRecPtr and
2272 : * recheck.
2273 : */
3941 heikki.linnakangas 2274 GIC 692 : if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2275 : {
5658 tgl 2276 40 : (void) GetRedoRecPtr();
3941 heikki.linnakangas 2277 40 : if (XLogCheckpointNeeded(openLogSegNo))
5762 tgl 2278 CBC 30 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2279 : }
6439 tgl 2280 ECB : }
8062 2281 : }
8595 vadim4o 2282 :
8062 tgl 2283 GIC 1184781 : if (ispartialpage)
2284 : {
2285 : /* Only asked to write a partial page */
2286 294461 : LogwrtResult.Write = WriteRqst.Write;
8062 tgl 2287 CBC 294461 : break;
2288 : }
6439 tgl 2289 GIC 890320 : curridx = NextBufIdx(curridx);
6439 tgl 2290 ECB :
2291 : /* If flexible, break out of loop as soon as we wrote something */
6439 tgl 2292 GIC 890320 : if (flexible && npages == 0)
6439 tgl 2293 CBC 4372 : break;
2294 : }
2295 :
2296 731996 : Assert(npages == 0);
8595 vadim4o 2297 ECB :
2298 : /*
2299 : * If asked to flush, do so
8062 tgl 2300 : */
3754 alvherre 2301 GIC 731996 : if (LogwrtResult.Flush < WriteRqst.Flush &&
2302 296291 : LogwrtResult.Flush < LogwrtResult.Write)
2303 : {
2304 : /*
6385 bruce 2305 ECB : * Could get here without iterating above loop, in which case we might
3260 2306 : * have no open file or the wrong one. However, we do not need to
2307 : * fsync more than one file.
2308 : */
5441 tgl 2309 GIC 296260 : if (sync_method != SYNC_METHOD_OPEN &&
2310 296260 : sync_method != SYNC_METHOD_OPEN_DSYNC)
2311 : {
8059 2312 296260 : if (openLogFile >= 0 &&
2028 andres 2313 CBC 296253 : !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2028 andres 2314 ECB : wal_segment_size))
6142 bruce 2315 UIC 0 : XLogFileClose();
8059 tgl 2316 CBC 296260 : if (openLogFile < 0)
8059 tgl 2317 ECB : {
2028 andres 2318 GIC 7 : XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2028 andres 2319 EUB : wal_segment_size);
520 rhaas 2320 CBC 7 : openLogTLI = tli;
520 rhaas 2321 GIC 7 : openLogFile = XLogFileOpen(openLogSegNo, tli);
1140 tgl 2322 CBC 7 : ReserveExternalFD();
2323 : }
3933 rhaas 2324 ECB :
520 rhaas 2325 CBC 296260 : issue_xlog_fsync(openLogFile, openLogSegNo, tli);
8062 tgl 2326 ECB : }
2327 :
2328 : /* signal that we need to wakeup walsenders later */
3933 rhaas 2329 CBC 296260 : WalSndWakeupRequest();
2330 :
8062 tgl 2331 GIC 296260 : LogwrtResult.Flush = LogwrtResult.Write;
2332 : }
8595 vadim4o 2333 ECB :
2334 : /*
8062 tgl 2335 : * Update shared-memory status
2336 : *
2337 : * We make sure that the shared 'request' values do not fall behind the
2338 : * 'result' values. This is not absolutely essential, but it saves some
2339 : * code in a couple of places.
2340 : */
2341 : {
3121 andres 2342 GIC 731996 : SpinLockAcquire(&XLogCtl->info_lck);
2343 731996 : XLogCtl->LogwrtResult = LogwrtResult;
2344 731996 : if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2345 285674 : XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
3121 andres 2346 CBC 731996 : if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2347 296653 : XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2348 731996 : SpinLockRelease(&XLogCtl->info_lck);
7772 tgl 2349 ECB : }
8062 tgl 2350 CBC 731996 : }
8062 tgl 2351 ECB :
5730 2352 : /*
2353 : * Record the LSN for an asynchronous transaction commit/abort
3988 2354 : * and nudge the WALWriter if there is work for it to do.
2355 : * (This should not be called for synchronous commits.)
2356 : */
2357 : void
4637 simon 2358 GIC 60542 : XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2359 : {
4165 2360 60542 : XLogRecPtr WriteRqstPtr = asyncXactLSN;
2361 : bool sleeping;
4165 simon 2362 ECB :
3121 andres 2363 GIC 60542 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 2364 CBC 60542 : LogwrtResult = XLogCtl->LogwrtResult;
3121 andres 2365 GIC 60542 : sleeping = XLogCtl->WalWriterSleeping;
2366 60542 : if (XLogCtl->asyncXactLSN < asyncXactLSN)
3121 andres 2367 CBC 60214 : XLogCtl->asyncXactLSN = asyncXactLSN;
2368 60542 : SpinLockRelease(&XLogCtl->info_lck);
4165 simon 2369 ECB :
3988 tgl 2370 : /*
2371 : * If the WALWriter is sleeping, we should kick it to make it come out of
3260 bruce 2372 : * low-power mode. Otherwise, determine whether there's a full page of
2373 : * WAL available to write.
2374 : */
3988 tgl 2375 GIC 60542 : if (!sleeping)
2376 : {
2377 : /* back off to last completed page boundary */
3941 heikki.linnakangas 2378 60508 : WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
4165 simon 2379 ECB :
2380 : /* if we have already flushed that far, we're done */
3754 alvherre 2381 GIC 60508 : if (WriteRqstPtr <= LogwrtResult.Flush)
3988 tgl 2382 CBC 17598 : return;
2383 : }
2384 :
4165 simon 2385 ECB : /*
3955 bruce 2386 : * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2387 : * to come out of low-power mode so that this async commit will reach disk
2388 : * within the expected amount of time.
2389 : */
3988 tgl 2390 GIC 42944 : if (ProcGlobal->walwriterLatch)
2391 8097 : SetLatch(ProcGlobal->walwriterLatch);
2392 : }
2393 :
3355 rhaas 2394 ECB : /*
2395 : * Record the LSN up to which we can remove WAL because it's not required by
2396 : * any replication slot.
2397 : */
2398 : void
3355 rhaas 2399 GIC 19623 : XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2400 : {
3121 andres 2401 19623 : SpinLockAcquire(&XLogCtl->info_lck);
2402 19623 : XLogCtl->replicationSlotMinLSN = lsn;
3121 andres 2403 CBC 19623 : SpinLockRelease(&XLogCtl->info_lck);
3355 rhaas 2404 GIC 19623 : }
3355 rhaas 2405 ECB :
2406 :
2407 : /*
2408 : * Return the oldest LSN we must retain to satisfy the needs of some
2409 : * replication slot.
2410 : */
2411 : static XLogRecPtr
3355 rhaas 2412 GIC 2660 : XLogGetReplicationSlotMinimumLSN(void)
2413 : {
2414 : XLogRecPtr retval;
2415 :
3121 andres 2416 CBC 2660 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 2417 GIC 2660 : retval = XLogCtl->replicationSlotMinLSN;
2418 2660 : SpinLockRelease(&XLogCtl->info_lck);
2419 :
3355 rhaas 2420 CBC 2660 : return retval;
3355 rhaas 2421 ECB : }
2422 :
2423 : /*
5163 heikki.linnakangas 2424 : * Advance minRecoveryPoint in control file.
2425 : *
2426 : * If we crash during recovery, we must reach this point again before the
2427 : * database is consistent.
2428 : *
2429 : * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2430 : * is only updated if it's not already greater than or equal to 'lsn'.
2431 : */
2432 : static void
5163 heikki.linnakangas 2433 GIC 87149 : UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2434 : {
2435 : /* Quick check using our local copy of the variable */
417 2436 87149 : if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
5163 heikki.linnakangas 2437 CBC 80958 : return;
2438 :
2439 : /*
1739 michael 2440 ECB : * An invalid minRecoveryPoint means that we need to recover all the WAL,
2441 : * i.e., we're doing crash recovery. We never modify the control file's
2442 : * value in that case, so we can short-circuit future checks here too. The
2443 : * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2444 : * updated until crash recovery finishes. We only do this for the startup
2445 : * process as it should not update its own reference of minRecoveryPoint
2446 : * until it has finished crash recovery to make sure that all WAL
2447 : * available is replayed in this case. This also saves from extra locks
2448 : * taken on the control file from the startup process.
2449 : */
417 heikki.linnakangas 2450 GIC 6191 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
2451 : {
1739 michael 2452 25 : updateMinRecoveryPoint = false;
2453 25 : return;
1739 michael 2454 ECB : }
2455 :
5163 heikki.linnakangas 2456 CBC 6166 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5163 heikki.linnakangas 2457 ECB :
2458 : /* update local copy */
417 heikki.linnakangas 2459 GIC 6166 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
417 heikki.linnakangas 2460 CBC 6166 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2461 :
417 heikki.linnakangas 2462 GIC 6166 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
1682 michael 2463 CBC 1 : updateMinRecoveryPoint = false;
417 heikki.linnakangas 2464 6165 : else if (force || LocalMinRecoveryPoint < lsn)
2465 : {
5050 bruce 2466 ECB : XLogRecPtr newMinRecoveryPoint;
3778 heikki.linnakangas 2467 : TimeLineID newMinRecoveryPointTLI;
5163 2468 :
2469 : /*
2470 : * To avoid having to update the control file too often, we update it
2471 : * all the way to the last record being replayed, even though 'lsn'
2472 : * would suffice for correctness. This also allows the 'force' case
2473 : * to not need a valid 'lsn' value.
2474 : *
2475 : * Another important reason for doing it this way is that the passed
2476 : * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2477 : * the caller got it from a corrupted heap page. Accepting such a
2478 : * value as the min recovery point would prevent us from coming up at
2479 : * all. Instead, we just log a warning and continue with recovery.
2480 : * (See also the comments about corrupt LSNs in XLogFlush.)
2481 : */
417 heikki.linnakangas 2482 GIC 5869 : newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
3754 alvherre 2483 5869 : if (!force && newMinRecoveryPoint < lsn)
5035 tgl 2484 UIC 0 : elog(WARNING,
2485 : "xlog min recovery request %X/%X is past current point %X/%X",
775 peter 2486 ECB : LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
5035 tgl 2487 :
5163 heikki.linnakangas 2488 EUB : /* update control file */
3754 alvherre 2489 GIC 5869 : if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2490 : {
5163 heikki.linnakangas 2491 5849 : ControlFile->minRecoveryPoint = newMinRecoveryPoint;
3778 2492 5849 : ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
5163 heikki.linnakangas 2493 CBC 5849 : UpdateControlFile();
417 heikki.linnakangas 2494 GIC 5849 : LocalMinRecoveryPoint = newMinRecoveryPoint;
417 heikki.linnakangas 2495 CBC 5849 : LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
5163 heikki.linnakangas 2496 ECB :
5163 heikki.linnakangas 2497 CBC 5849 : ereport(DEBUG2,
781 peter 2498 ECB : (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
417 heikki.linnakangas 2499 : LSN_FORMAT_ARGS(newMinRecoveryPoint),
2500 : newMinRecoveryPointTLI)));
5163 2501 : }
2502 : }
5163 heikki.linnakangas 2503 GIC 6166 : LWLockRelease(ControlFileLock);
2504 : }
2505 :
2506 : /*
8062 tgl 2507 ECB : * Ensure that all XLOG data through the given position is flushed to disk.
2508 : *
2509 : * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2510 : * already held, and we try to avoid acquiring it if possible.
2511 : */
2512 : void
8062 tgl 2513 GIC 1019619 : XLogFlush(XLogRecPtr record)
2514 : {
2515 : XLogRecPtr WriteRqstPtr;
2516 : XLogwrtRqst WriteRqst;
515 rhaas 2517 CBC 1019619 : TimeLineID insertTLI = XLogCtl->InsertTimeLineID;
2518 :
2519 : /*
2520 : * During REDO, we are reading not writing WAL. Therefore, instead of
4790 bruce 2521 ECB : * trying to flush the WAL, we should update minRecoveryPoint instead. We
2522 : * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2523 : * to act this way too, and because when it tries to write the
2524 : * end-of-recovery checkpoint, it should indeed flush.
2525 : */
5035 tgl 2526 GIC 1019619 : if (!XLogInsertAllowed())
2527 : {
5163 heikki.linnakangas 2528 87067 : UpdateMinRecoveryPoint(record, false);
8062 tgl 2529 714330 : return;
5163 heikki.linnakangas 2530 ECB : }
2531 :
8062 tgl 2532 : /* Quick exit if already known flushed */
3754 alvherre 2533 CBC 932552 : if (record <= LogwrtResult.Flush)
8062 tgl 2534 GIC 627263 : return;
2535 :
2536 : #ifdef WAL_DEBUG
7352 tgl 2537 ECB : if (XLOG_DEBUG)
7202 2538 : elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2539 : LSN_FORMAT_ARGS(record),
2540 : LSN_FORMAT_ARGS(LogwrtResult.Write),
2541 : LSN_FORMAT_ARGS(LogwrtResult.Flush));
2542 : #endif
2543 :
8062 tgl 2544 GIC 305289 : START_CRIT_SECTION();
2545 :
2546 : /*
2547 : * Since fsync is usually a horribly expensive operation, we try to
6385 bruce 2548 ECB : * piggyback as much data as we can on each fsync: if we see any more data
2549 : * entered into the xlog buffer, we'll write and fsync that too, so that
2550 : * the final value of LogwrtResult.Flush is as large as possible. This
2551 : * gives us some chance of avoiding another fsync immediately after.
2552 : */
2553 :
2554 : /* initialize to given target; may increase below */
8062 tgl 2555 GIC 305289 : WriteRqstPtr = record;
2556 :
2557 : /*
2558 : * Now wait until we get the write lock, or someone else does the flush
3955 bruce 2559 ECB : * for us.
2560 : */
2561 : for (;;)
7772 tgl 2562 GIC 737 : {
2563 : XLogRecPtr insertpos;
2564 :
2565 : /* read LogwrtResult and update local state */
3121 andres 2566 CBC 306026 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 2567 GIC 306026 : if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2568 13438 : WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2569 306026 : LogwrtResult = XLogCtl->LogwrtResult;
3121 andres 2570 CBC 306026 : SpinLockRelease(&XLogCtl->info_lck);
7862 tgl 2571 ECB :
4087 heikki.linnakangas 2572 : /* done already? */
3754 alvherre 2573 CBC 306026 : if (record <= LogwrtResult.Flush)
4087 heikki.linnakangas 2574 9956 : break;
2575 :
2576 : /*
3562 heikki.linnakangas 2577 ECB : * Before actually performing the write, wait for all in-flight
2578 : * insertions to the pages we're about to write to finish.
2579 : */
3562 heikki.linnakangas 2580 GIC 296070 : insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2581 :
2582 : /*
2583 : * Try to get the write lock. If we can't get it immediately, wait
4087 heikki.linnakangas 2584 ECB : * until it's released, and recheck if we still need to do the flush
2585 : * or if the backend that held the lock did it for us already. This
2586 : * helps to maintain a good rate of group committing when the system
2587 : * is bottlenecked by the speed of fsyncing.
2588 : */
4078 heikki.linnakangas 2589 GIC 296070 : if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2590 : {
2591 : /*
2592 : * The lock is now free, but we didn't acquire it yet. Before we
4087 heikki.linnakangas 2593 ECB : * do, loop back to check if someone else flushed the record for
2594 : * us already.
2595 : */
4087 heikki.linnakangas 2596 GIC 737 : continue;
2597 : }
2598 :
2599 : /* Got the lock; recheck whether request is satisfied */
4051 heikki.linnakangas 2600 CBC 295333 : LogwrtResult = XLogCtl->LogwrtResult;
3754 alvherre 2601 GIC 295333 : if (record <= LogwrtResult.Flush)
2602 : {
3933 rhaas 2603 642 : LWLockRelease(WALWriteLock);
3933 rhaas 2604 CBC 642 : break;
3933 rhaas 2605 ECB : }
2606 :
2607 : /*
2608 : * Sleep before flush! By adding a delay here, we may give further
2609 : * backends the opportunity to join the backlog of group commit
2610 : * followers; this can significantly improve transaction throughput,
2611 : * at the risk of increasing transaction latency.
2612 : *
2613 : * We do not sleep if enableFsync is not turned on, nor if there are
2614 : * fewer than CommitSiblings other backends with active transactions.
2615 : */
3933 rhaas 2616 GIC 294691 : if (CommitDelay > 0 && enableFsync &&
3933 rhaas 2617 UIC 0 : MinimumActiveBackends(CommitSiblings))
2618 : {
2619 0 : pg_usleep(CommitDelay);
3933 rhaas 2620 ECB :
3562 heikki.linnakangas 2621 EUB : /*
2622 : * Re-check how far we can now flush the WAL. It's generally not
2882 simon 2623 : * safe to call WaitXLogInsertionsToFinish while holding
2624 : * WALWriteLock, because an in-progress insertion might need to
2625 : * also grab WALWriteLock to make progress. But we know that all
2626 : * the insertions up to insertpos have already finished, because
2627 : * that's what the earlier WaitXLogInsertionsToFinish() returned.
2628 : * We're only calling it again to allow insertpos to be moved
2629 : * further forward, not to actually wait for anyone.
2630 : */
3562 heikki.linnakangas 2631 UIC 0 : insertpos = WaitXLogInsertionsToFinish(insertpos);
2632 : }
2633 :
2634 : /* try to write/flush later additions to XLOG as well */
3562 heikki.linnakangas 2635 GBC 294691 : WriteRqst.Write = insertpos;
3562 heikki.linnakangas 2636 GIC 294691 : WriteRqst.Flush = insertpos;
2637 :
520 rhaas 2638 294691 : XLogWrite(WriteRqst, insertTLI, false);
3933 rhaas 2639 ECB :
7862 tgl 2640 CBC 294691 : LWLockRelease(WALWriteLock);
2641 : /* done */
4087 heikki.linnakangas 2642 294691 : break;
2643 : }
8062 tgl 2644 ECB :
8062 tgl 2645 GIC 305289 : END_CRIT_SECTION();
7755 tgl 2646 ECB :
2647 : /* wake up walsenders now that we've released heavily contended locks */
1 andres 2648 GNC 305289 : WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3933 rhaas 2649 ECB :
2650 : /*
2651 : * If we still haven't flushed to the request point then we have a
6385 bruce 2652 : * problem; most likely, the requested flush point is past end of XLOG.
2653 : * This has been seen to occur when a disk page has a corrupted LSN.
2654 : *
2655 : * Formerly we treated this as a PANIC condition, but that hurts the
2656 : * system's robustness rather than helping it: we do not want to take down
2657 : * the whole system due to corruption on one data page. In particular, if
2658 : * the bad page is encountered again during recovery then we would be
2659 : * unable to restart the database at all! (This scenario actually
2660 : * happened in the field several times with 7.1 releases.) As of 8.4, bad
2661 : * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2662 : * the only time we can reach here during recovery is while flushing the
2663 : * end-of-recovery checkpoint record, and we don't expect that to have a
2664 : * bad LSN.
2665 : *
2666 : * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2667 : * since xact.c calls this routine inside a critical section. However,
2668 : * calls from bufmgr.c are not within critical sections and so we will not
2669 : * force a restart for a bad LSN on a data page.
2670 : */
3754 alvherre 2671 GIC 305289 : if (LogwrtResult.Flush < record)
5035 tgl 2672 UIC 0 : elog(ERROR,
2673 : "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2674 : LSN_FORMAT_ARGS(record),
775 peter 2675 ECB : LSN_FORMAT_ARGS(LogwrtResult.Flush));
8595 vadim4o 2676 EUB : }
2677 :
2678 : /*
2679 : * Write & flush xlog, but without specifying exactly where to.
2680 : *
2681 : * We normally write only completed blocks; but if there is nothing to do on
2682 : * that basis, we check for unwritten async commits in the current incomplete
2683 : * block, and write through the latest one of those. Thus, if async commits
2684 : * are not being used, we will write complete blocks only.
2685 : *
2686 : * If, based on the above, there's anything to write we do so immediately. But
2687 : * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2688 : * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2689 : * more than wal_writer_flush_after unflushed blocks.
2690 : *
2691 : * We can guarantee that async commits reach disk after at most three
2692 : * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2693 : * to write "flexibly", meaning it can stop at the end of the buffer ring;
2694 : * this makes a difference only with very high load or long wal_writer_delay,
2695 : * but imposes one extra cycle for the worst case for async commits.)
2696 : *
2697 : * This routine is invoked periodically by the background walwriter process.
2698 : *
2699 : * Returns true if there was any work to do, even if we skipped flushing due
2700 : * to wal_writer_delay/wal_writer_flush_after.
2701 : */
2702 : bool
5738 tgl 2703 GIC 14942 : XLogBackgroundFlush(void)
2704 : {
2705 : XLogwrtRqst WriteRqst;
2706 14942 : bool flexible = true;
2610 andres 2707 ECB : static TimestampTz lastflush;
2708 : TimestampTz now;
2709 : int flushbytes;
520 rhaas 2710 : TimeLineID insertTLI;
2711 :
2712 : /* XLOG doesn't need flushing during recovery */
5163 heikki.linnakangas 2713 GIC 14942 : if (RecoveryInProgress())
3988 tgl 2714 8 : return false;
2715 :
2716 : /*
515 rhaas 2717 ECB : * Since we're not in recovery, InsertTimeLineID is set and can't change,
520 2718 : * so we can read it without a lock.
2719 : */
515 rhaas 2720 GIC 14934 : insertTLI = XLogCtl->InsertTimeLineID;
2721 :
2722 : /* read LogwrtResult and update local state */
3121 andres 2723 14934 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 2724 CBC 14934 : LogwrtResult = XLogCtl->LogwrtResult;
2610 andres 2725 GIC 14934 : WriteRqst = XLogCtl->LogwrtRqst;
3121 2726 14934 : SpinLockRelease(&XLogCtl->info_lck);
5738 tgl 2727 ECB :
2728 : /* back off to last completed page boundary */
2610 andres 2729 CBC 14934 : WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
5738 tgl 2730 ECB :
2731 : /* if we have already flushed that far, consider async commit records */
2610 andres 2732 GIC 14934 : if (WriteRqst.Write <= LogwrtResult.Flush)
5738 tgl 2733 ECB : {
3121 andres 2734 GIC 8201 : SpinLockAcquire(&XLogCtl->info_lck);
2610 2735 8201 : WriteRqst.Write = XLogCtl->asyncXactLSN;
3121 andres 2736 CBC 8201 : SpinLockRelease(&XLogCtl->info_lck);
5738 tgl 2737 GIC 8201 : flexible = false; /* ensure it all gets written */
5738 tgl 2738 ECB : }
2739 :
4687 magnus 2740 : /*
4660 bruce 2741 : * If already known flushed, we're done. Just need to check if we are
2742 : * holding an open file handle to a logfile that's no longer in use,
2743 : * preventing the file from being deleted.
2744 : */
2610 andres 2745 GIC 14934 : if (WriteRqst.Write <= LogwrtResult.Flush)
2746 : {
4660 bruce 2747 7725 : if (openLogFile >= 0)
2748 : {
2028 andres 2749 CBC 4361 : if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2750 : wal_segment_size))
4687 magnus 2751 ECB : {
4687 magnus 2752 GIC 79 : XLogFileClose();
4687 magnus 2753 ECB : }
2754 : }
3988 tgl 2755 GIC 7725 : return false;
4687 magnus 2756 ECB : }
2757 :
2758 : /*
2610 andres 2759 : * Determine how far to flush WAL, based on the wal_writer_delay and
2760 : * wal_writer_flush_after GUCs.
2761 : */
2610 andres 2762 GIC 7209 : now = GetCurrentTimestamp();
2763 7209 : flushbytes =
2764 7209 : WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2765 :
2610 andres 2766 CBC 7209 : if (WalWriterFlushAfter == 0 || lastflush == 0)
2610 andres 2767 ECB : {
2768 : /* first call, or block based limits disabled */
2610 andres 2769 GIC 175 : WriteRqst.Flush = WriteRqst.Write;
2610 andres 2770 CBC 175 : lastflush = now;
2771 : }
2610 andres 2772 GIC 7034 : else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
2610 andres 2773 ECB : {
2774 : /*
2775 : * Flush the writes at least every WalWriterDelay ms. This is
1329 michael 2776 : * important to bound the amount of time it takes for an asynchronous
2777 : * commit to hit disk.
2778 : */
2610 andres 2779 GIC 1705 : WriteRqst.Flush = WriteRqst.Write;
2780 1705 : lastflush = now;
2781 : }
2782 5329 : else if (flushbytes >= WalWriterFlushAfter)
2610 andres 2783 ECB : {
2784 : /* exceeded wal_writer_flush_after blocks, flush */
2610 andres 2785 GIC 3 : WriteRqst.Flush = WriteRqst.Write;
2610 andres 2786 CBC 3 : lastflush = now;
2787 : }
2788 : else
2610 andres 2789 ECB : {
2790 : /* no flushing, this time round */
2610 andres 2791 GIC 5326 : WriteRqst.Flush = 0;
2792 : }
2793 :
2794 : #ifdef WAL_DEBUG
5738 tgl 2795 ECB : if (XLOG_DEBUG)
2796 : elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
2797 : LSN_FORMAT_ARGS(WriteRqst.Write),
2798 : LSN_FORMAT_ARGS(WriteRqst.Flush),
2799 : LSN_FORMAT_ARGS(LogwrtResult.Write),
2800 : LSN_FORMAT_ARGS(LogwrtResult.Flush));
2801 : #endif
2802 :
5738 tgl 2803 GIC 7209 : START_CRIT_SECTION();
2804 :
2805 : /* now wait for any in-progress insertions to finish and get write lock */
2610 andres 2806 7209 : WaitXLogInsertionsToFinish(WriteRqst.Write);
5738 tgl 2807 CBC 7209 : LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
4051 heikki.linnakangas 2808 GIC 7209 : LogwrtResult = XLogCtl->LogwrtResult;
2610 andres 2809 7209 : if (WriteRqst.Write > LogwrtResult.Write ||
2610 andres 2810 CBC 2366 : WriteRqst.Flush > LogwrtResult.Flush)
5738 tgl 2811 ECB : {
520 rhaas 2812 CBC 4879 : XLogWrite(WriteRqst, insertTLI, flexible);
5738 tgl 2813 ECB : }
5738 tgl 2814 CBC 7209 : LWLockRelease(WALWriteLock);
2815 :
2816 7209 : END_CRIT_SECTION();
2817 :
3933 rhaas 2818 ECB : /* wake up walsenders now that we've released heavily contended locks */
1 andres 2819 GNC 7209 : WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3958 simon 2820 ECB :
2821 : /*
2822 : * Great, done. To take some work off the critical path, try to initialize
3562 heikki.linnakangas 2823 : * as many of the no-longer-needed WAL buffers for future use as we can.
2824 : */
520 rhaas 2825 GIC 7209 : AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
2826 :
2827 : /*
2828 : * If we determined that we need to write data, but somebody else
2610 andres 2829 ECB : * wrote/flushed already, it should be considered as being active, to
2830 : * avoid hibernating too early.
2831 : */
2610 andres 2832 GIC 7209 : return true;
2833 : }
2834 :
2835 : /*
5793 tgl 2836 ECB : * Test whether XLOG data has been flushed up to (at least) the given position.
2837 : *
2838 : * Returns true if a flush is still needed. (It may be that someone else
2839 : * is already in process of flushing that far, however.)
2840 : */
2841 : bool
5793 tgl 2842 GIC 14946151 : XLogNeedsFlush(XLogRecPtr record)
2843 : {
2844 : /*
2845 : * During recovery, we don't flush WAL but update minRecoveryPoint
4859 simon 2846 ECB : * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2847 : * would need to be updated.
2848 : */
5163 heikki.linnakangas 2849 GIC 14946151 : if (RecoveryInProgress())
2850 : {
2851 : /*
2852 : * An invalid minRecoveryPoint means that we need to recover all the
1739 michael 2853 ECB : * WAL, i.e., we're doing crash recovery. We never modify the control
2854 : * file's value in that case, so we can short-circuit future checks
2855 : * here too. This triggers a quick exit path for the startup process,
2856 : * which cannot update its local copy of minRecoveryPoint as long as
2857 : * it has not replayed all WAL available when doing crash recovery.
2858 : */
417 heikki.linnakangas 2859 GIC 1407027 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
1739 michael 2860 UIC 0 : updateMinRecoveryPoint = false;
2861 :
2862 : /* Quick exit if already known to be updated or cannot be updated */
417 heikki.linnakangas 2863 CBC 1407027 : if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
4859 simon 2864 GBC 1392862 : return false;
2865 :
2866 : /*
4859 simon 2867 ECB : * Update local copy of minRecoveryPoint. But if the lock is busy,
2868 : * just return a conservative guess.
2869 : */
4859 simon 2870 GIC 14165 : if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
4859 simon 2871 UIC 0 : return true;
417 heikki.linnakangas 2872 GIC 14165 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
2873 14165 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4859 simon 2874 CBC 14165 : LWLockRelease(ControlFileLock);
4859 simon 2875 EUB :
1682 michael 2876 ECB : /*
2877 : * Check minRecoveryPoint for any other process than the startup
2878 : * process doing crash recovery, which should not update the control
2879 : * file value if crash recovery is still running.
2880 : */
417 heikki.linnakangas 2881 GIC 14165 : if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
1682 michael 2882 UIC 0 : updateMinRecoveryPoint = false;
2883 :
2884 : /* check again */
417 heikki.linnakangas 2885 CBC 14165 : if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
1682 michael 2886 GBC 77 : return false;
2887 : else
1682 michael 2888 GIC 14088 : return true;
4859 simon 2889 ECB : }
5163 heikki.linnakangas 2890 :
2891 : /* Quick exit if already known flushed */
3754 alvherre 2892 CBC 13539124 : if (record <= LogwrtResult.Flush)
5793 tgl 2893 GIC 13431597 : return false;
2894 :
2895 : /* read LogwrtResult and update local state */
3121 andres 2896 CBC 107527 : SpinLockAcquire(&XLogCtl->info_lck);
2897 107527 : LogwrtResult = XLogCtl->LogwrtResult;
3121 andres 2898 GIC 107527 : SpinLockRelease(&XLogCtl->info_lck);
2899 :
5793 tgl 2900 ECB : /* check again */
3754 alvherre 2901 CBC 107527 : if (record <= LogwrtResult.Flush)
5793 tgl 2902 2195 : return false;
2903 :
5793 tgl 2904 GIC 105332 : return true;
5793 tgl 2905 ECB : }
2906 :
2907 : /*
650 noah 2908 : * Try to make a given XLOG file segment exist.
2909 : *
2910 : * logsegno: identify segment.
2911 : *
2912 : * *added: on return, true if this call raised the number of extant segments.
2913 : *
2914 : * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
2915 : *
2916 : * Returns -1 or FD of opened file. A -1 here is not an error; a caller
2917 : * wanting an open segment should attempt to open "path", which usually will
2918 : * succeed. (This is weird, but it's efficient for the callers.)
2919 : */
2920 : static int
520 rhaas 2921 GIC 7666 : XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
2922 : bool *added, char *path)
2923 : {
2924 : char tmppath[MAXPGPATH];
2925 : XLogSegNo installed_segno;
2926 : XLogSegNo max_segno;
2927 : int fd;
2928 : int save_errno;
1 tmunro 2929 GNC 7666 : int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
2930 :
520 rhaas 2931 GIC 7666 : Assert(logtli != 0);
2932 :
520 rhaas 2933 CBC 7666 : XLogFilePath(path, logtli, logsegno, wal_segment_size);
2934 :
8190 vadim4o 2935 ECB : /*
2936 : * Try to use existent file (checkpoint maker may have created it already)
2937 : */
650 noah 2938 GIC 7666 : *added = false;
37 tmunro 2939 GNC 7666 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
2940 7666 : get_sync_bit(sync_method));
650 noah 2941 GIC 7666 : if (fd < 0)
2942 : {
650 noah 2943 CBC 684 : if (errno != ENOENT)
650 noah 2944 LBC 0 : ereport(ERROR,
650 noah 2945 ECB : (errcode_for_file_access(),
2946 : errmsg("could not open file \"%s\": %m", path)));
2947 : }
2948 : else
650 noah 2949 GBC 6982 : return fd;
2950 :
2951 : /*
2952 : * Initialize an empty (all zeroes) segment. NOTE: it is possible that
2953 : * another process is doing the same thing. If so, we will end up
6385 bruce 2954 ECB : * pre-creating an extra log segment. That seems OK, and better than
2955 : * holding the lock throughout this lengthy process.
2956 : */
5762 tgl 2957 GIC 684 : elog(DEBUG2, "creating and filling new WAL file");
2958 :
6488 2959 684 : snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2960 :
8058 2961 684 : unlink(tmppath);
8595 vadim4o 2962 ECB :
1 tmunro 2963 GNC 684 : if (io_direct_flags & IO_DIRECT_WAL_INIT)
1 tmunro 2964 UNC 0 : open_flags |= PG_O_DIRECT;
2965 :
2966 : /* do not use get_sync_bit() here --- want to fsync only at end of fill */
1 tmunro 2967 GNC 684 : fd = BasicOpenFile(tmppath, open_flags);
8595 vadim4o 2968 GIC 684 : if (fd < 0)
6568 tgl 2969 LBC 0 : ereport(ERROR,
2970 : (errcode_for_file_access(),
7136 peter_e 2971 ECB : errmsg("could not create file \"%s\": %m", tmppath)));
8595 vadim4o 2972 EUB :
1468 tmunro 2973 CBC 684 : pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
2974 684 : save_errno = 0;
1468 tmunro 2975 GBC 684 : if (wal_init_zero)
2976 : {
2977 : ssize_t rc;
818 tmunro 2978 ECB :
1468 2979 : /*
2980 : * Zero-fill the file. With this setting, we do this the hard way to
2981 : * ensure that all the file space has really been allocated. On
2982 : * platforms that allow "holes" in files, just seeking to the end
2983 : * doesn't allocate intermediate space. This way, we know that we
2984 : * have all the space and (after the fsync below) that all the
2985 : * indirect blocks are down on disk. Therefore, fdatasync(2) or
2986 : * O_DSYNC will be sufficient to sync future writes to the log file.
2987 : */
34 michael 2988 GNC 684 : rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
2989 :
152 2990 684 : if (rc < 0)
152 michael 2991 UNC 0 : save_errno = errno;
1468 tmunro 2992 ECB : }
2993 : else
2994 : {
2995 : /*
2996 : * Otherwise, seeking to the end and writing a solitary byte is
2997 : * enough.
2998 : */
3504 jdavis 2999 UBC 0 : errno = 0;
152 michael 3000 UNC 0 : if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
3565 jdavis 3001 EUB : {
3002 : /* if write didn't set errno, assume no disk space */
1468 tmunro 3003 UBC 0 : save_errno = errno ? errno : ENOSPC;
3004 : }
1468 tmunro 3005 EUB : }
1468 tmunro 3006 GIC 684 : pgstat_report_wait_end();
3007 :
3008 684 : if (save_errno)
3009 : {
1468 tmunro 3010 ECB : /*
3011 : * If we fail to make the file, delete it to release disk space
3012 : */
1468 tmunro 3013 UBC 0 : unlink(tmppath);
3565 jdavis 3014 EUB :
1468 tmunro 3015 UBC 0 : close(fd);
3565 jdavis 3016 EUB :
1468 tmunro 3017 UIC 0 : errno = save_errno;
3018 :
3019 0 : ereport(ERROR,
1468 tmunro 3020 ECB : (errcode_for_file_access(),
3021 : errmsg("could not write to file \"%s\": %m", tmppath)));
8085 tgl 3022 : }
8595 vadim4o 3023 EUB :
2213 rhaas 3024 GIC 684 : pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
8157 tgl 3025 684 : if (pg_fsync(fd) != 0)
3026 : {
226 drowley 3027 UNC 0 : save_errno = errno;
3785 heikki.linnakangas 3028 UIC 0 : close(fd);
1749 michael 3029 0 : errno = save_errno;
6568 tgl 3030 0 : ereport(ERROR,
7202 tgl 3031 ECB : (errcode_for_file_access(),
3032 : errmsg("could not fsync file \"%s\": %m", tmppath)));
3033 : }
2213 rhaas 3034 GIC 684 : pgstat_report_wait_end();
3035 :
1373 peter 3036 684 : if (close(fd) != 0)
6568 tgl 3037 UIC 0 : ereport(ERROR,
3038 : (errcode_for_file_access(),
3039 : errmsg("could not close file \"%s\": %m", tmppath)));
3040 :
3041 : /*
650 noah 3042 ECB : * Now move the segment into place with its final name. Cope with
3043 : * possibility that someone else has created the file while we were
3044 : * filling ours: if so, use ours to pre-create a future log segment.
3045 : */
3941 heikki.linnakangas 3046 CBC 684 : installed_segno = logsegno;
2967 heikki.linnakangas 3047 ECB :
3048 : /*
3049 : * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3050 : * that was a constant, but that was always a bit dubious: normally, at a
3051 : * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3052 : * here, it was the offset from the insert location. We can't do the
3053 : * normal XLOGfileslop calculation here because we don't have access to
3054 : * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3055 : * CheckPointSegments.
2967 heikki.linnakangas 3056 EUB : */
2967 heikki.linnakangas 3057 GBC 684 : max_segno = logsegno + CheckPointSegments;
520 rhaas 3058 GIC 684 : if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
3059 : logtli))
650 noah 3060 ECB : {
650 noah 3061 GIC 684 : *added = true;
3062 684 : elog(DEBUG2, "done creating and filling new WAL file");
3063 : }
3064 : else
3065 : {
3066 : /*
3067 : * No need for any more future segments, or InstallXLogFileSegment()
3068 : * failed to rename the file into place. If the rename failed, a
3069 : * caller opening the file may fail.
3070 : */
7934 tgl 3071 UIC 0 : unlink(tmppath);
650 noah 3072 0 : elog(DEBUG2, "abandoned new WAL file");
3073 : }
3074 :
650 noah 3075 GIC 684 : return -1;
650 noah 3076 ECB : }
3077 :
3078 : /*
3079 : * Create a new XLOG file segment, or open a pre-existing one.
3080 : *
3081 : * logsegno: identify segment to be created/opened.
3082 : *
3083 : * Returns FD of opened file.
3084 : *
3085 : * Note: errors here are ERROR not PANIC because we might or might not be
3086 : * inside a critical section (eg, during checkpoint there is no reason to
3087 : * take down the system on failure). They will promote to PANIC if we are
3088 : * in a critical section.
3089 : */
3090 : int
520 rhaas 3091 CBC 7624 : XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
650 noah 3092 EUB : {
3093 : bool ignore_added;
3094 : char path[MAXPGPATH];
650 noah 3095 ECB : int fd;
3096 :
520 rhaas 3097 GIC 7624 : Assert(logtli != 0);
3098 :
3099 7624 : fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
650 noah 3100 7624 : if (fd >= 0)
3101 6949 : return fd;
3102 :
3103 : /* Now open original target segment (might not be file I just made) */
37 tmunro 3104 GNC 675 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3105 675 : get_sync_bit(sync_method));
7934 tgl 3106 GIC 675 : if (fd < 0)
6568 tgl 3107 UIC 0 : ereport(ERROR,
3108 : (errcode_for_file_access(),
3109 : errmsg("could not open file \"%s\": %m", path)));
6297 neilc 3110 GIC 675 : return fd;
3111 : }
3112 :
3113 : /*
3114 : * Create a new XLOG file segment by copying a pre-existing one.
6836 tgl 3115 ECB : *
3116 : * destsegno: identify segment to be created.
3117 : *
3118 : * srcTLI, srcsegno: identify segment to be copied (could be from
3119 : * a different timeline)
3120 : *
3121 : * upto: how much of the source file to copy (the rest is filled with
3122 : * zeros)
3123 : *
3124 : * Currently this is only used during recovery, and so there are no locking
3125 : * considerations. But we should be just as tense as XLogFileInit to avoid
3126 : * emplacing a bogus file.
3127 : */
3128 : static void
520 rhaas 3129 CBC 30 : XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
520 rhaas 3130 ECB : TimeLineID srcTLI, XLogSegNo srcsegno,
2839 fujii 3131 : int upto)
6836 tgl 3132 EUB : {
3133 : char path[MAXPGPATH];
3134 : char tmppath[MAXPGPATH];
3135 : PGAlignedXLogBlock buffer;
3136 : int srcfd;
3137 : int fd;
3138 : int nbytes;
6836 tgl 3139 ECB :
3140 : /*
3141 : * Open the source file
3142 : */
2028 andres 3143 GIC 30 : XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
2024 peter_e 3144 CBC 30 : srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
6836 tgl 3145 30 : if (srcfd < 0)
6568 tgl 3146 UBC 0 : ereport(ERROR,
3147 : (errcode_for_file_access(),
3148 : errmsg("could not open file \"%s\": %m", path)));
3149 :
3150 : /*
3151 : * Copy into a temp file name.
3152 : */
6488 tgl 3153 CBC 30 : snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3154 :
6836 tgl 3155 GIC 30 : unlink(tmppath);
3156 :
5443 magnus 3157 ECB : /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2024 peter_e 3158 GIC 30 : fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
6836 tgl 3159 30 : if (fd < 0)
6568 tgl 3160 UIC 0 : ereport(ERROR,
3161 : (errcode_for_file_access(),
3162 : errmsg("could not create file \"%s\": %m", tmppath)));
6836 tgl 3163 ECB :
3164 : /*
3165 : * Do the data copying.
3166 : */
2028 andres 3167 GIC 61470 : for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3168 : {
3169 : int nread;
3034 heikki.linnakangas 3170 ECB :
3034 heikki.linnakangas 3171 CBC 61440 : nread = upto - nbytes;
3034 heikki.linnakangas 3172 ECB :
3173 : /*
2878 bruce 3174 : * The part that is not read from the source file is filled with
3175 : * zeros.
3034 heikki.linnakangas 3176 EUB : */
3034 heikki.linnakangas 3177 GBC 61440 : if (nread < sizeof(buffer))
1681 tgl 3178 GIC 30 : memset(buffer.data, 0, sizeof(buffer));
3179 :
3034 heikki.linnakangas 3180 61440 : if (nread > 0)
3181 : {
1726 michael 3182 EUB : int r;
3183 :
3034 heikki.linnakangas 3184 GIC 2603 : if (nread > sizeof(buffer))
3185 2573 : nread = sizeof(buffer);
2213 rhaas 3186 2603 : pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
1681 tgl 3187 CBC 2603 : r = read(srcfd, buffer.data, nread);
1726 michael 3188 GIC 2603 : if (r != nread)
3034 heikki.linnakangas 3189 ECB : {
1726 michael 3190 LBC 0 : if (r < 0)
3034 heikki.linnakangas 3191 0 : ereport(ERROR,
3192 : (errcode_for_file_access(),
2893 heikki.linnakangas 3193 EUB : errmsg("could not read file \"%s\": %m",
3194 : path)));
3195 : else
3034 heikki.linnakangas 3196 UIC 0 : ereport(ERROR,
3197 : (errcode(ERRCODE_DATA_CORRUPTED),
1721 michael 3198 EUB : errmsg("could not read file \"%s\": read %d of %zu",
3199 : path, r, (Size) nread)));
3034 heikki.linnakangas 3200 : }
2213 rhaas 3201 GIC 2603 : pgstat_report_wait_end();
6836 tgl 3202 EUB : }
6836 tgl 3203 GIC 61440 : errno = 0;
2213 rhaas 3204 61440 : pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
1681 tgl 3205 61440 : if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
6836 tgl 3206 ECB : {
6836 tgl 3207 UIC 0 : int save_errno = errno;
3208 :
6836 tgl 3209 ECB : /*
6385 bruce 3210 : * If we fail to make the file, delete it to release disk space
6836 tgl 3211 EUB : */
6836 tgl 3212 UIC 0 : unlink(tmppath);
3213 : /* if write didn't set errno, assume problem is no disk space */
6836 tgl 3214 LBC 0 : errno = save_errno ? save_errno : ENOSPC;
3215 :
6568 3216 0 : ereport(ERROR,
6836 tgl 3217 EUB : (errcode_for_file_access(),
3218 : errmsg("could not write to file \"%s\": %m", tmppath)));
3219 : }
2213 rhaas 3220 GIC 61440 : pgstat_report_wait_end();
6836 tgl 3221 ECB : }
6836 tgl 3222 EUB :
2213 rhaas 3223 GIC 30 : pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
6836 tgl 3224 30 : if (pg_fsync(fd) != 0)
1602 tmunro 3225 UIC 0 : ereport(data_sync_elevel(ERROR),
3226 : (errcode_for_file_access(),
3227 : errmsg("could not fsync file \"%s\": %m", tmppath)));
2213 rhaas 3228 GIC 30 : pgstat_report_wait_end();
6836 tgl 3229 ECB :
1373 peter 3230 GBC 30 : if (CloseTransientFile(fd) != 0)
6568 tgl 3231 LBC 0 : ereport(ERROR,
3232 : (errcode_for_file_access(),
3233 : errmsg("could not close file \"%s\": %m", tmppath)));
3234 :
1373 peter 3235 GIC 30 : if (CloseTransientFile(srcfd) != 0)
1492 michael 3236 UIC 0 : ereport(ERROR,
3237 : (errcode_for_file_access(),
3238 : errmsg("could not close file \"%s\": %m", path)));
3239 :
3240 : /*
3241 : * Now move the segment into place with its final name.
3242 : */
520 rhaas 3243 GIC 30 : if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
2839 fujii 3244 UIC 0 : elog(ERROR, "InstallXLogFileSegment should not have failed");
6836 tgl 3245 GIC 30 : }
3246 :
3247 : /*
3248 : * Install a new XLOG segment file as a current or future log segment.
3249 : *
3250 : * This is used both to install a newly-created segment (which has a temp
3251 : * filename while it's being created) and to recycle an old segment.
3252 : *
3253 : * *segno: identify segment to install as (or first possible target).
3254 : * When find_free is true, this is modified on return to indicate the
3255 : * actual installation location or last segment searched.
3256 : *
3257 : * tmppath: initial name of file to install. It will be renamed into place.
3258 : *
3259 : * find_free: if true, install the new segment at the first empty segno
2062 peter_e 3260 ECB : * number at or after the passed numbers. If false, install the new segment
3261 : * exactly where specified, deleting any existing segment file there.
3262 : *
3263 : * max_segno: maximum segment number to install the new file as. Fail if no
3264 : * free slot is found between *segno and max_segno. (Ignored when find_free
3265 : * is false.)
7934 tgl 3266 : *
3267 : * tli: The timeline on which the new segment should be installed.
520 rhaas 3268 : *
3269 : * Returns true if the file was installed successfully. false indicates that
650 noah 3270 : * max_segno limit was exceeded, the startup process has disabled this
3271 : * function for now, or an error occurred while renaming the file into place.
3272 : */
7934 tgl 3273 EUB : static bool
3941 heikki.linnakangas 3274 GBC 1267 : InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3275 : bool find_free, XLogSegNo max_segno, TimeLineID tli)
3276 : {
7934 tgl 3277 ECB : char path[MAXPGPATH];
3278 : struct stat stat_buf;
3279 :
520 rhaas 3280 CBC 1267 : Assert(tli != 0);
3281 :
520 rhaas 3282 GIC 1267 : XLogFilePath(path, tli, *segno, wal_segment_size);
3283 :
650 noah 3284 1267 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
650 noah 3285 CBC 1267 : if (!XLogCtl->InstallXLogFileSegmentActive)
3286 : {
650 noah 3287 LBC 0 : LWLockRelease(ControlFileLock);
650 noah 3288 UIC 0 : return false;
3289 : }
8058 tgl 3290 ECB :
7934 tgl 3291 CBC 1267 : if (!find_free)
3292 : {
7934 tgl 3293 ECB : /* Force installation: get rid of any pre-existing segment file */
2204 teodor 3294 CBC 30 : durable_unlink(path, DEBUG1);
3295 : }
3296 : else
3297 : {
7934 tgl 3298 ECB : /* Find a free slot to put it in */
7379 tgl 3299 CBC 1643 : while (stat(path, &stat_buf) == 0)
3300 : {
2967 heikki.linnakangas 3301 GBC 426 : if ((*segno) >= max_segno)
3302 : {
7934 tgl 3303 EUB : /* Failed to find a free slot within specified range */
650 noah 3304 GIC 20 : LWLockRelease(ControlFileLock);
7934 tgl 3305 20 : return false;
7934 tgl 3306 ECB : }
3941 heikki.linnakangas 3307 GIC 406 : (*segno)++;
520 rhaas 3308 CBC 406 : XLogFilePath(path, tli, *segno, wal_segment_size);
3309 : }
3310 : }
3311 :
278 michael 3312 GNC 1247 : Assert(access(path, F_OK) != 0 && errno == ENOENT);
3313 1247 : if (durable_rename(tmppath, path, LOG) != 0)
3314 : {
650 noah 3315 UIC 0 : LWLockRelease(ControlFileLock);
3316 : /* durable_rename already emitted log message */
4956 heikki.linnakangas 3317 LBC 0 : return false;
3318 : }
8190 vadim4o 3319 ECB :
650 noah 3320 CBC 1247 : LWLockRelease(ControlFileLock);
8058 tgl 3321 ECB :
7934 tgl 3322 GBC 1247 : return true;
3323 : }
3324 :
3325 : /*
6836 tgl 3326 ECB : * Open a pre-existing logfile segment for writing.
3327 : */
3328 : int
520 rhaas 3329 GIC 7 : XLogFileOpen(XLogSegNo segno, TimeLineID tli)
3330 : {
3331 : char path[MAXPGPATH];
3332 : int fd;
8595 vadim4o 3333 ECB :
520 rhaas 3334 GIC 7 : XLogFilePath(path, tli, segno, wal_segment_size);
8595 vadim4o 3335 ECB :
37 tmunro 3336 GNC 7 : fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3337 7 : get_sync_bit(sync_method));
8595 vadim4o 3338 GIC 7 : if (fd < 0)
7202 tgl 3339 UIC 0 : ereport(PANIC,
3340 : (errcode_for_file_access(),
3341 : errmsg("could not open file \"%s\": %m", path)));
3342 :
6836 tgl 3343 GIC 7 : return fd;
3344 : }
6836 tgl 3345 ECB :
6142 bruce 3346 : /*
3347 : * Close the current logfile segment for writing.
3348 : */
3349 : static void
6142 bruce 3350 GIC 1705 : XLogFileClose(void)
3351 : {
6142 bruce 3352 GBC 1705 : Assert(openLogFile >= 0);
3353 :
6142 bruce 3354 EUB : /*
6139 tgl 3355 : * WAL segment files will not be re-read in normal operation, so we advise
3260 bruce 3356 : * the OS to release any cached pages. But do not do so if WAL archiving
3357 : * or streaming is active, because archiver and walsender process could
3358 : * use the cache to read the WAL segment.
3359 : */
3360 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
1 tmunro 3361 GNC 1705 : if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
5201 tgl 3362 CBC 1257 : (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
6142 bruce 3363 ECB : #endif
3364 :
1373 peter 3365 GIC 1705 : if (close(openLogFile) != 0)
3366 : {
3367 : char xlogfname[MAXFNAMELEN];
1223 michael 3368 UIC 0 : int save_errno = errno;
3369 :
520 rhaas 3370 0 : XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
1223 michael 3371 0 : errno = save_errno;
6142 bruce 3372 0 : ereport(PANIC,
3373 : (errcode_for_file_access(),
3374 : errmsg("could not close file \"%s\": %m", xlogfname)));
3375 : }
3376 :
6142 bruce 3377 GIC 1705 : openLogFile = -1;
1140 tgl 3378 1705 : ReleaseExternalFD();
6142 bruce 3379 1705 : }
3380 :
3381 : /*
3382 : * Preallocate log files beyond the specified log endpoint.
3383 : *
3384 : * XXX this is currently extremely conservative, since it forces only one
5764 tgl 3385 ECB : * future log segment to exist, and even that only if we are 75% done with
3386 : * the current one. This is only appropriate for very low-WAL-volume systems.
3387 : * High-volume systems will be OK once they've built up a sufficient set of
3388 : * recycled log segments, but the startup transient is likely to include
3389 : * a lot of segment creations by foreground processes, which is not so good.
3390 : *
3391 : * XLogFileInitInternal() can ereport(ERROR). All known causes indicate big
3392 : * trouble; for example, a full filesystem is one cause. The checkpoint WAL
650 noah 3393 : * and/or ControlFile updates already completed. If a RequestCheckpoint()
3394 : * initiated the present checkpoint and an ERROR ends this function, the
3395 : * command that called RequestCheckpoint() fails. That's not ideal, but it's
3396 : * not worth contorting more functions to use caller-specified elevel values.
3397 : * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
3398 : * reporting and resource reclamation.)
3399 : */
5762 tgl 3400 : static void
520 rhaas 3401 CBC 2538 : PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
8062 tgl 3402 ECB : {
3941 heikki.linnakangas 3403 : XLogSegNo _logSegNo;
8062 tgl 3404 : int lf;
650 noah 3405 : bool added;
3406 : char path[MAXPGPATH];
3407 : uint64 offset;
3408 :
650 noah 3409 GIC 2538 : if (!XLogCtl->InstallXLogFileSegmentActive)
3410 4 : return; /* unlocked check says no */
3411 :
2028 andres 3412 2534 : XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3413 2534 : offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3414 2534 : if (offset >= (uint32) (0.75 * wal_segment_size))
3415 : {
3941 heikki.linnakangas 3416 42 : _logSegNo++;
520 rhaas 3417 42 : lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
650 noah 3418 42 : if (lf >= 0)
3419 33 : close(lf);
3420 42 : if (added)
5762 tgl 3421 9 : CheckpointStats.ckpt_segs_added++;
8062 tgl 3422 ECB : }
3423 : }
3424 :
3425 : /*
3426 : * Throws an error if the given log segment has already been removed or
3748 heikki.linnakangas 3427 : * recycled. The caller should only pass a segment that it knows to have
3428 : * existed while the server has been running, as this function always
3429 : * succeeds if no WAL segments have been removed since startup.
3430 : * 'tli' is only used in the error message.
1952 tgl 3431 : *
3432 : * Note: this function guarantees to keep errno unchanged on return.
3433 : * This supports callers that use this to possibly deliver a better
3434 : * error message about a missing file, while still being able to throw
1952 tgl 3435 EUB : * a normal file-access error afterwards, if this does return.
4745 heikki.linnakangas 3436 : */
3437 : void
3748 heikki.linnakangas 3438 GIC 47944 : CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3439 : {
1952 tgl 3440 47944 : int save_errno = errno;
3441 : XLogSegNo lastRemovedSegNo;
4745 heikki.linnakangas 3442 ECB :
3121 andres 3443 CBC 47944 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 3444 GIC 47944 : lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3445 47944 : SpinLockRelease(&XLogCtl->info_lck);
3446 :
3748 heikki.linnakangas 3447 47944 : if (segno <= lastRemovedSegNo)
3448 : {
3449 : char filename[MAXFNAMELEN];
3450 :
2028 andres 3451 UIC 0 : XLogFileName(filename, tli, segno, wal_segment_size);
1952 tgl 3452 0 : errno = save_errno;
3748 heikki.linnakangas 3453 LBC 0 : ereport(ERROR,
3454 : (errcode_for_file_access(),
3455 : errmsg("requested WAL segment %s has already been removed",
3456 : filename)));
3748 heikki.linnakangas 3457 ECB : }
1952 tgl 3458 CBC 47944 : errno = save_errno;
4745 heikki.linnakangas 3459 47944 : }
3460 :
3324 rhaas 3461 ECB : /*
3462 : * Return the last WAL segment removed, or 0 if no segment has been removed
3463 : * since startup.
3464 : *
3465 : * NB: the result can be out of date arbitrarily fast, the caller has to deal
3466 : * with that.
3467 : */
3468 : XLogSegNo
3324 rhaas 3469 GIC 745 : XLogGetLastRemovedSegno(void)
3324 rhaas 3470 ECB : {
3471 : XLogSegNo lastRemovedSegNo;
3472 :
3121 andres 3473 GIC 745 : SpinLockAcquire(&XLogCtl->info_lck);
3474 745 : lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3121 andres 3475 CBC 745 : SpinLockRelease(&XLogCtl->info_lck);
3476 :
3324 rhaas 3477 745 : return lastRemovedSegNo;
3324 rhaas 3478 ECB : }
3479 :
1097 alvherre 3480 :
4745 heikki.linnakangas 3481 : /*
3482 : * Update the last removed segno pointer in shared memory, to reflect that the
3483 : * given XLOG file has been removed.
3484 : */
3485 : static void
4745 heikki.linnakangas 3486 GIC 573 : UpdateLastRemovedPtr(char *filename)
3487 : {
3488 : uint32 tli;
3489 : XLogSegNo segno;
4745 heikki.linnakangas 3490 ECB :
2028 andres 3491 GIC 573 : XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3492 :
3121 3493 573 : SpinLockAcquire(&XLogCtl->info_lck);
3494 573 : if (segno > XLogCtl->lastRemovedSegNo)
3121 andres 3495 CBC 225 : XLogCtl->lastRemovedSegNo = segno;
3121 andres 3496 GIC 573 : SpinLockRelease(&XLogCtl->info_lck);
4745 heikki.linnakangas 3497 CBC 573 : }
4745 heikki.linnakangas 3498 ECB :
3499 : /*
3500 : * Remove all temporary log files in pg_wal
3501 : *
1731 michael 3502 : * This is called at the beginning of recovery after a previous crash,
3503 : * at a point where no other processes write fresh WAL data.
3504 : */
1731 michael 3505 EUB : static void
1731 michael 3506 GBC 131 : RemoveTempXlogFiles(void)
1731 michael 3507 EUB : {
3508 : DIR *xldir;
1731 michael 3509 ECB : struct dirent *xlde;
3510 :
1731 michael 3511 GIC 131 : elog(DEBUG2, "removing all temporary WAL segments");
3512 :
3513 131 : xldir = AllocateDir(XLOGDIR);
3514 709 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3515 : {
3516 : char path[MAXPGPATH];
3517 :
3518 578 : if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3519 578 : continue;
3520 :
1731 michael 3521 UIC 0 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3522 0 : unlink(path);
1731 michael 3523 LBC 0 : elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3524 : }
1731 michael 3525 GIC 131 : FreeDir(xldir);
3526 131 : }
3527 :
3528 : /*
3529 : * Recycle or remove all log files older or equal to passed segno.
3530 : *
3531 : * endptr is current (or recent) end of xlog, and lastredoptr is the
3532 : * redo pointer of the last checkpoint. These are used to determine
7934 tgl 3533 ECB : * whether we want to recycle rather than delete no-longer-wanted log files.
520 rhaas 3534 : *
3535 : * insertTLI is the current timeline for XLOG insertion. Any recycled
3536 : * segments should be reused for this timeline.
3537 : */
3538 : static void
520 rhaas 3539 GIC 2363 : RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
3540 : TimeLineID insertTLI)
8190 vadim4o 3541 ECB : {
3542 : DIR *xldir;
8053 bruce 3543 : struct dirent *xlde;
3544 : char lastoff[MAXFNAMELEN];
3545 : XLogSegNo endlogSegNo;
814 michael 3546 : XLogSegNo recycleSegNo;
3547 :
3548 : /* Initialize info about where to try to recycle to */
814 michael 3549 GIC 2363 : XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
3550 2363 : recycleSegNo = XLOGfileslop(lastredoptr);
8190 vadim4o 3551 ECB :
3762 heikki.linnakangas 3552 : /*
3553 : * Construct a filename of the last segment to be kept. The timeline ID
3554 : * doesn't matter, we ignore that in the comparison. (During recovery,
3555 : * InsertTimeLineID isn't set, so we can't use that.)
3556 : */
2028 andres 3557 GIC 2363 : XLogFileName(lastoff, 0, segno, wal_segment_size);
3558 :
4605 simon 3559 2363 : elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3560 : lastoff);
3561 :
1952 tgl 3562 2363 : xldir = AllocateDir(XLOGDIR);
3563 :
6488 3564 13798 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3565 : {
2918 heikki.linnakangas 3566 ECB : /* Ignore files that are not XLOG segments */
2893 heikki.linnakangas 3567 GIC 11435 : if (!IsXLogFileName(xlde->d_name) &&
2893 heikki.linnakangas 3568 CBC 7147 : !IsPartialXLogFileName(xlde->d_name))
2918 heikki.linnakangas 3569 GIC 7145 : continue;
3570 :
6838 tgl 3571 ECB : /*
3572 : * We ignore the timeline part of the XLOG segment identifiers in
3260 bruce 3573 : * deciding whether a segment is still needed. This ensures that we
3574 : * won't prematurely remove a segment from a parent timeline. We could
3575 : * probably be a little more proactive about removing segments of
3576 : * non-parent timelines, but that would be a whole lot more
3577 : * complicated.
6836 tgl 3578 : *
6385 bruce 3579 : * We use the alphanumeric sorting property of the filenames to decide
3580 : * which ones are earlier than the lastoff segment.
3581 : */
2918 heikki.linnakangas 3582 GIC 4290 : if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3583 : {
3705 3584 579 : if (XLogArchiveCheckDone(xlde->d_name))
3585 : {
3586 : /* Update the last removed location in shared memory first */
4745 3587 573 : UpdateLastRemovedPtr(xlde->d_name);
3588 :
219 michael 3589 GNC 573 : RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
3590 : }
3591 : }
3592 : }
3593 :
2918 heikki.linnakangas 3594 GIC 2363 : FreeDir(xldir);
3595 2363 : }
2918 heikki.linnakangas 3596 ECB :
3597 : /*
3598 : * Remove WAL files that are not part of the given timeline's history.
3599 : *
3600 : * This is called during recovery, whenever we switch to follow a new
3601 : * timeline, and at the end of recovery when we create a new timeline. We
3602 : * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3603 : * might be leftover pre-allocated or recycled WAL segments on the old timeline
3604 : * that we haven't used yet, and contain garbage. If we just leave them in
3605 : * pg_wal, they will eventually be archived, and we can't let that happen.
3606 : * Files that belong to our timeline history are valid, because we have
3607 : * successfully replayed them, but from others we can't be sure.
3608 : *
3609 : * 'switchpoint' is the current point in WAL where we switch to new timeline,
3610 : * and 'newTLI' is the new timeline we switch to.
3611 : */
3612 : void
2918 heikki.linnakangas 3613 GIC 48 : RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3614 : {
3615 : DIR *xldir;
2918 heikki.linnakangas 3616 ECB : struct dirent *xlde;
3617 : char switchseg[MAXFNAMELEN];
3618 : XLogSegNo endLogSegNo;
3619 : XLogSegNo switchLogSegNo;
3620 : XLogSegNo recycleSegNo;
3621 :
3622 : /*
814 michael 3623 : * Initialize info about where to begin the work. This will recycle,
3624 : * somewhat arbitrarily, 10 future segments.
3625 : */
814 michael 3626 CBC 48 : XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
3627 48 : XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
814 michael 3628 GIC 48 : recycleSegNo = endLogSegNo + 10;
3629 :
3630 : /*
3631 : * Construct a filename of the last segment to be kept.
3632 : */
3633 48 : XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
4959 heikki.linnakangas 3634 ECB :
2918 heikki.linnakangas 3635 CBC 48 : elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
3636 : switchseg);
3637 :
1952 tgl 3638 GIC 48 : xldir = AllocateDir(XLOGDIR);
3639 :
2918 heikki.linnakangas 3640 408 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3641 : {
3642 : /* Ignore files that are not XLOG segments */
2893 heikki.linnakangas 3643 CBC 360 : if (!IsXLogFileName(xlde->d_name))
2918 3644 208 : continue;
3645 :
3646 : /*
3647 : * Remove files that are on a timeline older than the new one we're
2878 bruce 3648 ECB : * switching to, but with a segment number >= the first segment on the
3649 : * new timeline.
3650 : */
2918 heikki.linnakangas 3651 GIC 152 : if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
3652 98 : strcmp(xlde->d_name + 8, switchseg + 8) > 0)
3653 : {
3654 : /*
3655 : * If the file has already been marked as .ready, however, don't
3656 : * remove it yet. It should be OK to remove it - files that are
3657 : * not part of our timeline history are not required for recovery
3658 : * - but seems safer to let them be archived and removed later.
3659 : */
3660 12 : if (!XLogArchiveIsReady(xlde->d_name))
219 michael 3661 GNC 12 : RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
3662 : }
3663 : }
2918 heikki.linnakangas 3664 ECB :
2918 heikki.linnakangas 3665 GIC 48 : FreeDir(xldir);
3666 48 : }
3667 :
3668 : /*
3669 : * Recycle or remove a log file that's no longer needed.
3670 : *
3671 : * segment_de is the dirent structure of the segment to recycle or remove.
3672 : * recycleSegNo is the segment number to recycle up to. endlogSegNo is
3673 : * the segment number of the current (or recent) end of WAL.
814 michael 3674 ECB : *
3675 : * endlogSegNo gets incremented if the segment is recycled so as it is not
3676 : * checked again with future callers of this function.
3677 : *
3678 : * insertTLI is the current timeline for XLOG insertion. Any recycled segments
3679 : * should be used for this timeline.
3680 : */
2918 heikki.linnakangas 3681 : static void
219 michael 3682 GNC 585 : RemoveXlogFile(const struct dirent *segment_de,
3683 : XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
3684 : TimeLineID insertTLI)
2918 heikki.linnakangas 3685 ECB : {
3686 : char path[MAXPGPATH];
3687 : #ifdef WIN32
3688 : char newpath[MAXPGPATH];
3689 : #endif
219 michael 3690 GNC 585 : const char *segname = segment_de->d_name;
3691 :
2918 heikki.linnakangas 3692 CBC 585 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
3693 :
2918 heikki.linnakangas 3694 ECB : /*
3695 : * Before deleting the file, see if it can be recycled as a future log
3696 : * segment. Only recycle normal files, because we don't want to recycle
3697 : * symbolic links pointing to a separate archive directory.
3698 : */
1468 tmunro 3699 GIC 585 : if (wal_recycle &&
814 michael 3700 585 : *endlogSegNo <= recycleSegNo &&
650 noah 3701 CBC 1111 : XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
219 michael 3702 GNC 1106 : get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
814 michael 3703 GIC 553 : InstallXLogFileSegment(endlogSegNo, path,
3704 : true, recycleSegNo, insertTLI))
3705 : {
2918 heikki.linnakangas 3706 533 : ereport(DEBUG2,
3707 : (errmsg_internal("recycled write-ahead log file \"%s\"",
3708 : segname)));
3709 533 : CheckpointStats.ckpt_segs_recycled++;
3710 : /* Needn't recheck that slot on future iterations */
814 michael 3711 533 : (*endlogSegNo)++;
3712 : }
3713 : else
3714 : {
3715 : /* No need for any more future segments, or recycling failed ... */
3716 : int rc;
3717 :
2918 heikki.linnakangas 3718 52 : ereport(DEBUG2,
3719 : (errmsg_internal("removing write-ahead log file \"%s\"",
3720 : segname)));
3721 :
3722 : #ifdef WIN32
3723 :
3724 : /*
3725 : * On Windows, if another process (e.g another backend) holds the file
3726 : * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
3727 : * will still show up in directory listing until the last handle is
2878 bruce 3728 ECB : * closed. To avoid confusing the lingering deleted file for a live
3729 : * WAL file that needs to be archived, rename it before deleting it.
2918 heikki.linnakangas 3730 : *
3731 : * If another process holds the file open without FILE_SHARE_DELETE
3732 : * flag, rename will fail. We'll try again at the next checkpoint.
2918 heikki.linnakangas 3733 EUB : */
3734 : snprintf(newpath, MAXPGPATH, "%s.deleted", path);
2918 heikki.linnakangas 3735 ECB : if (rename(path, newpath) != 0)
3736 : {
3737 : ereport(LOG,
3738 : (errcode_for_file_access(),
3739 : errmsg("could not rename file \"%s\": %m",
3740 : path)));
3741 : return;
3742 : }
3743 : rc = durable_unlink(newpath, LOG);
3744 : #else
2204 teodor 3745 GIC 52 : rc = durable_unlink(path, LOG);
3746 : #endif
2918 heikki.linnakangas 3747 52 : if (rc != 0)
3748 : {
3749 : /* Message already logged by durable_unlink() */
2918 heikki.linnakangas 3750 UIC 0 : return;
3751 : }
2918 heikki.linnakangas 3752 GIC 52 : CheckpointStats.ckpt_segs_removed++;
3753 : }
3754 :
2918 heikki.linnakangas 3755 CBC 585 : XLogArchiveCleanup(segname);
3756 : }
3757 :
3758 : /*
3759 : * Verify whether pg_wal and pg_wal/archive_status exist.
3760 : * If the latter does not exist, recreate it.
5264 tgl 3761 ECB : *
3762 : * It is not the goal of this function to verify the contents of these
5264 tgl 3763 EUB : * directories, but to help in cases where someone has performed a cluster
3764 : * copy for PITR purposes but omitted pg_wal from the copy.
3765 : *
3766 : * We could also recreate pg_wal if it doesn't exist, but a deliberate
3767 : * policy decision was made not to. It is fairly common for pg_wal to be
5264 tgl 3768 ECB : * a symlink, and if that was the DBA's intent then automatically making a
3769 : * plain directory would result in degraded performance with no notice.
3770 : */
3771 : static void
5264 tgl 3772 CBC 1176 : ValidateXLOGDirectoryStructure(void)
5264 tgl 3773 EUB : {
3774 : char path[MAXPGPATH];
3775 : struct stat stat_buf;
3776 :
3777 : /* Check for pg_wal; if it doesn't exist, error out */
5264 tgl 3778 GIC 1176 : if (stat(XLOGDIR, &stat_buf) != 0 ||
5264 tgl 3779 GBC 1176 : !S_ISDIR(stat_buf.st_mode))
5050 bruce 3780 UIC 0 : ereport(FATAL,
5264 tgl 3781 EUB : (errmsg("required WAL directory \"%s\" does not exist",
3782 : XLOGDIR)));
3783 :
3784 : /* Check for archive_status */
5264 tgl 3785 GIC 1176 : snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
5264 tgl 3786 CBC 1176 : if (stat(path, &stat_buf) == 0)
3787 : {
3788 : /* Check for weird cases where it exists but isn't a directory */
5264 tgl 3789 GIC 1176 : if (!S_ISDIR(stat_buf.st_mode))
5050 bruce 3790 UIC 0 : ereport(FATAL,
3791 : (errmsg("required WAL directory \"%s\" does not exist",
3792 : path)));
3793 : }
5264 tgl 3794 ECB : else
3795 : {
5264 tgl 3796 UIC 0 : ereport(LOG,
3797 : (errmsg("creating missing WAL directory \"%s\"", path)));
1828 sfrost 3798 0 : if (MakePGDirectory(path) < 0)
5050 bruce 3799 0 : ereport(FATAL,
5264 tgl 3800 ECB : (errmsg("could not create missing directory \"%s\": %m",
3801 : path)));
3802 : }
5264 tgl 3803 GIC 1176 : }
5264 tgl 3804 ECB :
3805 : /*
6135 3806 : * Remove previous backup history files. This also retries creation of
3807 : * .ready files for any backup history files for which XLogArchiveNotify
3808 : * failed earlier.
3809 : */
6507 bruce 3810 : static void
6135 tgl 3811 CBC 117 : CleanupBackupHistory(void)
6507 bruce 3812 ECB : {
3813 : DIR *xldir;
3814 : struct dirent *xlde;
3815 : char path[MAXPGPATH + sizeof(XLOGDIR)];
3816 :
6488 tgl 3817 CBC 117 : xldir = AllocateDir(XLOGDIR);
6507 bruce 3818 ECB :
6488 tgl 3819 GIC 1047 : while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3820 : {
2893 heikki.linnakangas 3821 813 : if (IsBackupHistoryFileName(xlde->d_name))
3822 : {
5326 tgl 3823 118 : if (XLogArchiveCheckDone(xlde->d_name))
3824 : {
2158 peter_e 3825 108 : elog(DEBUG2, "removing WAL backup history file \"%s\"",
3826 : xlde->d_name);
2189 3827 108 : snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
6507 bruce 3828 108 : unlink(path);
3829 108 : XLogArchiveCleanup(xlde->d_name);
3830 : }
3831 : }
3832 : }
3833 :
3834 117 : FreeDir(xldir);
3835 117 : }
3836 :
8062 tgl 3837 ECB : /*
3838 : * I/O routines for pg_control
3839 : *
3840 : * *ControlFile is a buffer in shared memory that holds an image of the
3841 : * contents of pg_control. WriteControlFile() initializes pg_control
3842 : * given a preloaded buffer, ReadControlFile() loads the buffer from
3843 : * the pg_control file (during postmaster or standalone-backend startup),
3844 : * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3845 : * InitControlFile() fills the buffer with initial values.
3846 : *
417 heikki.linnakangas 3847 : * For simplicity, WriteControlFile() initializes the fields of pg_control
417 heikki.linnakangas 3848 EUB : * that are related to checking backend/database compatibility, and
3849 : * ReadControlFile() verifies they are correct. We could split out the
3850 : * I/O and compatibility-check functions, but there seems no need currently.
3851 : */
699 tmunro 3852 ECB :
3853 : static void
417 heikki.linnakangas 3854 CBC 305 : InitControlFile(uint64 sysidentifier)
417 heikki.linnakangas 3855 ECB : {
3856 : char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
8595 vadim4o 3857 :
3858 : /*
3859 : * Generate a random nonce. This is used for authentication requests that
417 heikki.linnakangas 3860 : * will fail because the user does not exist. The nonce is used to create
3861 : * a genuine-looking password challenge for the non-existent user, in lieu
3862 : * of an actual stored password.
3863 : */
417 heikki.linnakangas 3864 CBC 305 : if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
417 heikki.linnakangas 3865 LBC 0 : ereport(PANIC,
417 heikki.linnakangas 3866 ECB : (errcode(ERRCODE_INTERNAL_ERROR),
3867 : errmsg("could not generate secret authorization token")));
8595 vadim4o 3868 :
417 heikki.linnakangas 3869 CBC 305 : memset(ControlFile, 0, sizeof(ControlFileData));
3870 : /* Initialize pg_control status fields */
417 heikki.linnakangas 3871 GIC 305 : ControlFile->system_identifier = sysidentifier;
417 heikki.linnakangas 3872 CBC 305 : memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
417 heikki.linnakangas 3873 GIC 305 : ControlFile->state = DB_SHUTDOWNED;
3874 305 : ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
3875 :
3876 : /* Set important parameter values for use when replaying WAL */
1147 peter 3877 305 : ControlFile->MaxConnections = MaxConnections;
3878 305 : ControlFile->max_worker_processes = max_worker_processes;
3879 305 : ControlFile->max_wal_senders = max_wal_senders;
1147 peter 3880 CBC 305 : ControlFile->max_prepared_xacts = max_prepared_xacts;
3881 305 : ControlFile->max_locks_per_xact = max_locks_per_xact;
1147 peter 3882 GIC 305 : ControlFile->wal_level = wal_level;
1147 peter 3883 CBC 305 : ControlFile->wal_log_hints = wal_log_hints;
3884 305 : ControlFile->track_commit_timestamp = track_commit_timestamp;
1147 peter 3885 GIC 305 : ControlFile->data_checksum_version = bootstrap_data_checksum_version;
1147 peter 3886 CBC 305 : }
1147 peter 3887 ECB :
8170 tgl 3888 : static void
8170 tgl 3889 CBC 305 : WriteControlFile(void)
3890 : {
8170 tgl 3891 ECB : int fd;
2090 3892 : char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
3893 :
3894 : /*
8062 3895 : * Initialize version and compatibility-check fields
3896 : */
8062 tgl 3897 GIC 305 : ControlFile->pg_control_version = PG_CONTROL_VERSION;
3898 305 : ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3899 :
6397 3900 305 : ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3901 305 : ControlFile->floatFormat = FLOATFORMAT_VALUE;
3902 :
8170 3903 305 : ControlFile->blcksz = BLCKSZ;
8170 tgl 3904 CBC 305 : ControlFile->relseg_size = RELSEG_SIZE;
6215 3905 305 : ControlFile->xlog_blcksz = XLOG_BLCKSZ;
2028 andres 3906 GIC 305 : ControlFile->xlog_seg_size = wal_segment_size;
7658 lockhart 3907 ECB :
7658 lockhart 3908 GIC 305 : ControlFile->nameDataLen = NAMEDATALEN;
6585 tgl 3909 CBC 305 : ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
7658 lockhart 3910 EUB :
5850 tgl 3911 GIC 305 : ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3230 3912 305 : ControlFile->loblksize = LOBLKSIZE;
3913 :
5466 3914 305 : ControlFile->float8ByVal = FLOAT8PASSBYVAL;
7658 lockhart 3915 ECB :
8062 tgl 3916 : /* Contents are protected with a CRC */
3078 heikki.linnakangas 3917 CBC 305 : INIT_CRC32C(ControlFile->crc);
3078 heikki.linnakangas 3918 GIC 305 : COMP_CRC32C(ControlFile->crc,
3919 : (char *) ControlFile,
3078 heikki.linnakangas 3920 EUB : offsetof(ControlFileData, crc));
3078 heikki.linnakangas 3921 GBC 305 : FIN_CRC32C(ControlFile->crc);
8062 tgl 3922 EUB :
3923 : /*
3924 : * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
3925 : * the excess over sizeof(ControlFileData). This reduces the odds of
3926 : * premature-EOF errors when reading pg_control. We'll still fail when we
6214 tgl 3927 ECB : * check the contents of the file, but hopefully with a more specific
3928 : * error than "couldn't read pg_control".
8170 3929 : */
2090 tgl 3930 CBC 305 : memset(buffer, 0, PG_CONTROL_FILE_SIZE);
8170 tgl 3931 GBC 305 : memcpy(buffer, ControlFile, sizeof(ControlFileData));
3932 :
6488 tgl 3933 GIC 305 : fd = BasicOpenFile(XLOG_CONTROL_FILE,
3934 : O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
8170 tgl 3935 CBC 305 : if (fd < 0)
7202 tgl 3936 UIC 0 : ereport(PANIC,
7202 tgl 3937 ECB : (errcode_for_file_access(),
1721 michael 3938 EUB : errmsg("could not create file \"%s\": %m",
3939 : XLOG_CONTROL_FILE)));
3940 :
7977 tgl 3941 GIC 305 : errno = 0;
2213 rhaas 3942 CBC 305 : pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
2090 tgl 3943 GIC 305 : if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
3944 : {
7977 tgl 3945 ECB : /* if write didn't set errno, assume problem is no disk space */
7977 tgl 3946 UIC 0 : if (errno == 0)
3947 0 : errno = ENOSPC;
7202 3948 0 : ereport(PANIC,
3949 : (errcode_for_file_access(),
3950 : errmsg("could not write to file \"%s\": %m",
3951 : XLOG_CONTROL_FILE)));
3952 : }
2213 rhaas 3953 GIC 305 : pgstat_report_wait_end();
3954 :
2213 rhaas 3955 CBC 305 : pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
8157 tgl 3956 GIC 305 : if (pg_fsync(fd) != 0)
7202 tgl 3957 LBC 0 : ereport(PANIC,
7202 tgl 3958 EUB : (errcode_for_file_access(),
3959 : errmsg("could not fsync file \"%s\": %m",
3960 : XLOG_CONTROL_FILE)));
2213 rhaas 3961 GIC 305 : pgstat_report_wait_end();
3962 :
1373 peter 3963 CBC 305 : if (close(fd) != 0)
7013 tgl 3964 LBC 0 : ereport(PANIC,
7013 tgl 3965 ECB : (errcode_for_file_access(),
3966 : errmsg("could not close file \"%s\": %m",
1721 michael 3967 EUB : XLOG_CONTROL_FILE)));
8170 tgl 3968 GBC 305 : }
3969 :
3970 : static void
8170 tgl 3971 GIC 1222 : ReadControlFile(void)
3972 : {
2917 heikki.linnakangas 3973 EUB : pg_crc32c crc;
3974 : int fd;
3975 : static char wal_segsz_str[20];
3976 : int r;
3977 :
8170 tgl 3978 ECB : /*
3979 : * Read data...
3980 : */
6488 tgl 3981 GIC 1222 : fd = BasicOpenFile(XLOG_CONTROL_FILE,
3982 : O_RDWR | PG_BINARY);
8170 3983 1222 : if (fd < 0)
7202 tgl 3984 UIC 0 : ereport(PANIC,
3985 : (errcode_for_file_access(),
3986 : errmsg("could not open file \"%s\": %m",
3987 : XLOG_CONTROL_FILE)));
3988 :
2213 rhaas 3989 CBC 1222 : pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
1787 magnus 3990 GBC 1222 : r = read(fd, ControlFile, sizeof(ControlFileData));
1787 magnus 3991 GIC 1222 : if (r != sizeof(ControlFileData))
3992 : {
1787 magnus 3993 UIC 0 : if (r < 0)
3994 0 : ereport(PANIC,
3995 : (errcode_for_file_access(),
3996 : errmsg("could not read file \"%s\": %m",
3997 : XLOG_CONTROL_FILE)));
1787 magnus 3998 ECB : else
1787 magnus 3999 UBC 0 : ereport(PANIC,
4000 : (errcode(ERRCODE_DATA_CORRUPTED),
4001 : errmsg("could not read file \"%s\": read %d of %zu",
4002 : XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4003 : }
2213 rhaas 4004 GIC 1222 : pgstat_report_wait_end();
4005 :
8170 tgl 4006 1222 : close(fd);
8170 tgl 4007 ECB :
8062 4008 : /*
4009 : * Check for expected pg_control format version. If this is wrong, the
4010 : * CRC check will likely fail because we'll be checking the wrong number
6385 bruce 4011 : * of bytes. Complaining about wrong version will probably be more
4012 : * enlightening than complaining about wrong CRC.
8062 tgl 4013 : */
5557 peter_e 4014 EUB :
5557 peter_e 4015 GIC 1222 : if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
5557 peter_e 4016 UIC 0 : ereport(FATAL,
4017 : (errmsg("database files are incompatible with server"),
4018 : errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4019 : " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4020 : ControlFile->pg_control_version, ControlFile->pg_control_version,
4021 : PG_CONTROL_VERSION, PG_CONTROL_VERSION),
5557 peter_e 4022 ECB : errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
5557 peter_e 4023 EUB :
8062 tgl 4024 GIC 1222 : if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
7202 tgl 4025 UIC 0 : ereport(FATAL,
4026 : (errmsg("database files are incompatible with server"),
4027 : errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4028 : " but the server was compiled with PG_CONTROL_VERSION %d.",
2118 tgl 4029 ECB : ControlFile->pg_control_version, PG_CONTROL_VERSION),
7202 tgl 4030 EUB : errhint("It looks like you need to initdb.")));
4031 :
4032 : /* Now check the CRC. */
3078 heikki.linnakangas 4033 GIC 1222 : INIT_CRC32C(crc);
4034 1222 : COMP_CRC32C(crc,
4035 : (char *) ControlFile,
3078 heikki.linnakangas 4036 ECB : offsetof(ControlFileData, crc));
3078 heikki.linnakangas 4037 GBC 1222 : FIN_CRC32C(crc);
4038 :
3078 heikki.linnakangas 4039 GIC 1222 : if (!EQ_CRC32C(crc, ControlFile->crc))
7202 tgl 4040 UIC 0 : ereport(FATAL,
7136 peter_e 4041 ECB : (errmsg("incorrect checksum in control file")));
8137 vadim4o 4042 EUB :
4043 : /*
4044 : * Do compatibility checking immediately. If the database isn't
4045 : * compatible with the backend executable, we want to abort before we can
4046 : * possibly do any damage.
4047 : */
8062 tgl 4048 CBC 1222 : if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
7202 tgl 4049 UBC 0 : ereport(FATAL,
4050 : (errmsg("database files are incompatible with server"),
4051 : errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4052 : " but the server was compiled with CATALOG_VERSION_NO %d.",
4053 : ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4054 : errhint("It looks like you need to initdb.")));
6397 tgl 4055 CBC 1222 : if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
6397 tgl 4056 UBC 0 : ereport(FATAL,
4057 : (errmsg("database files are incompatible with server"),
4058 : errdetail("The database cluster was initialized with MAXALIGN %d,"
4059 : " but the server was compiled with MAXALIGN %d.",
4060 : ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4061 : errhint("It looks like you need to initdb.")));
6397 tgl 4062 CBC 1222 : if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
6397 tgl 4063 UBC 0 : ereport(FATAL,
4064 : (errmsg("database files are incompatible with server"),
4065 : errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4066 : errhint("It looks like you need to initdb.")));
8170 tgl 4067 GIC 1222 : if (ControlFile->blcksz != BLCKSZ)
7202 tgl 4068 UIC 0 : ereport(FATAL,
7202 tgl 4069 ECB : (errmsg("database files are incompatible with server"),
2118 tgl 4070 EUB : errdetail("The database cluster was initialized with BLCKSZ %d,"
4071 : " but the server was compiled with BLCKSZ %d.",
4072 : ControlFile->blcksz, BLCKSZ),
4073 : errhint("It looks like you need to recompile or initdb.")));
8170 tgl 4074 GIC 1222 : if (ControlFile->relseg_size != RELSEG_SIZE)
7202 tgl 4075 UIC 0 : ereport(FATAL,
7202 tgl 4076 ECB : (errmsg("database files are incompatible with server"),
2118 tgl 4077 EUB : errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4078 : " but the server was compiled with RELSEG_SIZE %d.",
4079 : ControlFile->relseg_size, RELSEG_SIZE),
4080 : errhint("It looks like you need to recompile or initdb.")));
6215 tgl 4081 GIC 1222 : if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
6215 tgl 4082 UIC 0 : ereport(FATAL,
6215 tgl 4083 ECB : (errmsg("database files are incompatible with server"),
2118 tgl 4084 EUB : errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4085 : " but the server was compiled with XLOG_BLCKSZ %d.",
4086 : ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4087 : errhint("It looks like you need to recompile or initdb.")));
7658 lockhart 4088 GIC 1222 : if (ControlFile->nameDataLen != NAMEDATALEN)
7202 tgl 4089 UIC 0 : ereport(FATAL,
4090 : (errmsg("database files are incompatible with server"),
4091 : errdetail("The database cluster was initialized with NAMEDATALEN %d,"
2118 tgl 4092 ECB : " but the server was compiled with NAMEDATALEN %d.",
2118 tgl 4093 EUB : ControlFile->nameDataLen, NAMEDATALEN),
4094 : errhint("It looks like you need to recompile or initdb.")));
6585 tgl 4095 GIC 1222 : if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
7202 tgl 4096 UIC 0 : ereport(FATAL,
4097 : (errmsg("database files are incompatible with server"),
4098 : errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4099 : " but the server was compiled with INDEX_MAX_KEYS %d.",
4100 : ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4101 : errhint("It looks like you need to recompile or initdb.")));
5850 tgl 4102 GIC 1222 : if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
5850 tgl 4103 UIC 0 : ereport(FATAL,
4104 : (errmsg("database files are incompatible with server"),
4105 : errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4106 : " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
2118 tgl 4107 ECB : ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4108 : errhint("It looks like you need to recompile or initdb.")));
3230 tgl 4109 CBC 1222 : if (ControlFile->loblksize != LOBLKSIZE)
3230 tgl 4110 UBC 0 : ereport(FATAL,
4111 : (errmsg("database files are incompatible with server"),
4112 : errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4113 : " but the server was compiled with LOBLKSIZE %d.",
4114 : ControlFile->loblksize, (int) LOBLKSIZE),
4115 : errhint("It looks like you need to recompile or initdb.")));
7658 lockhart 4116 ECB :
5466 tgl 4117 : #ifdef USE_FLOAT8_BYVAL
5466 tgl 4118 GIC 1222 : if (ControlFile->float8ByVal != true)
5466 tgl 4119 UIC 0 : ereport(FATAL,
4120 : (errmsg("database files are incompatible with server"),
5466 tgl 4121 ECB : errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
2118 tgl 4122 EUB : " but the server was compiled with USE_FLOAT8_BYVAL."),
4123 : errhint("It looks like you need to recompile or initdb.")));
4124 : #else
5466 tgl 4125 ECB : if (ControlFile->float8ByVal != false)
5466 tgl 4126 EUB : ereport(FATAL,
4127 : (errmsg("database files are incompatible with server"),
4128 : errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
2118 tgl 4129 ECB : " but the server was compiled without USE_FLOAT8_BYVAL."),
5466 4130 : errhint("It looks like you need to recompile or initdb.")));
4131 : #endif
4132 :
2028 andres 4133 CBC 1222 : wal_segment_size = ControlFile->xlog_seg_size;
4134 :
2028 andres 4135 GIC 1222 : if (!IsValidWalSegSize(wal_segment_size))
2028 andres 4136 LBC 0 : ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4137 : errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
1788 peter_e 4138 ECB : "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
4139 : wal_segment_size,
4140 : wal_segment_size)));
4141 :
2028 andres 4142 GIC 1222 : snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4143 1222 : SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4144 : PGC_S_DYNAMIC_DEFAULT);
2028 andres 4145 ECB :
4146 : /* check and update variables dependent on wal_segment_size */
2028 andres 4147 CBC 1222 : if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
2028 andres 4148 LBC 0 : ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4149 : errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
4150 :
2028 andres 4151 GIC 1222 : if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
2028 andres 4152 UIC 0 : ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4153 : errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
2028 andres 4154 ECB :
2028 andres 4155 GIC 1222 : UsableBytesInSegment =
2028 andres 4156 CBC 1222 : (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
2028 andres 4157 ECB : (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4158 :
2028 andres 4159 GIC 1222 : CalculateCheckpointSegments();
4160 :
4161 : /* Make the initdb settings visible as GUC variables, too */
1826 magnus 4162 1222 : SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4163 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
8170 tgl 4164 CBC 1222 : }
4165 :
1483 michael 4166 ECB : /*
4167 : * Utility wrapper to update the control file. Note that the control
4168 : * file gets flushed.
4169 : */
4170 : static void
8170 tgl 4171 GIC 10909 : UpdateControlFile(void)
4172 : {
1469 peter 4173 10909 : update_controlfile(DataDir, ControlFile, true);
8586 vadim4o 4174 CBC 10909 : }
4175 :
4832 heikki.linnakangas 4176 ECB : /*
4177 : * Returns the unique system identifier from control file.
4178 : */
4179 : uint64
4832 heikki.linnakangas 4180 GIC 926 : GetSystemIdentifier(void)
4181 : {
4182 926 : Assert(ControlFile != NULL);
4183 926 : return ControlFile->system_identifier;
4184 : }
4185 :
4186 : /*
4187 : * Returns the random nonce from control file.
4188 : */
4189 : char *
2224 heikki.linnakangas 4190 CBC 1 : GetMockAuthenticationNonce(void)
4191 : {
2224 heikki.linnakangas 4192 GIC 1 : Assert(ControlFile != NULL);
4193 1 : return ControlFile->mock_authentication_nonce;
4194 : }
2224 heikki.linnakangas 4195 ECB :
3670 simon 4196 : /*
4197 : * Are checksums enabled for data pages?
4198 : */
4199 : bool
1826 magnus 4200 GIC 14333955 : DataChecksumsEnabled(void)
4201 : {
3670 simon 4202 14333955 : Assert(ControlFile != NULL);
3631 4203 14333955 : return (ControlFile->data_checksum_version > 0);
4204 : }
4205 :
4206 : /*
4207 : * Returns a fake LSN for unlogged relations.
4208 : *
4209 : * Each call generates an LSN that is greater than any previous value
4210 : * returned. The current counter value is saved and restored across clean
4211 : * shutdowns, but like unlogged relations, does not survive a crash. This can
4212 : * be used in lieu of real LSN values returned by XLogInsert, if you need an
4213 : * LSN-like increasing sequence of numbers without writing any WAL.
3709 heikki.linnakangas 4214 ECB : */
4215 : XLogRecPtr
3709 heikki.linnakangas 4216 GIC 33 : GetFakeLSNForUnloggedRel(void)
4217 : {
3602 bruce 4218 ECB : XLogRecPtr nextUnloggedLSN;
3709 heikki.linnakangas 4219 :
4220 : /* increment the unloggedLSN counter, need SpinLock */
3121 andres 4221 CBC 33 : SpinLockAcquire(&XLogCtl->ulsn_lck);
4222 33 : nextUnloggedLSN = XLogCtl->unloggedLSN++;
4223 33 : SpinLockRelease(&XLogCtl->ulsn_lck);
4224 :
3709 heikki.linnakangas 4225 GIC 33 : return nextUnloggedLSN;
4226 : }
4227 :
4228 : /*
4229 : * Auto-tune the number of XLOG buffers.
4460 tgl 4230 ECB : *
4231 : * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4232 : * a maximum of one XLOG segment (there is little reason to think that more
4233 : * is helpful, at least so long as we force an fsync when switching log files)
4234 : * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4385 4235 : * 9.1, when auto-tuning was added).
4236 : *
4237 : * This should not be called until NBuffers has received its final value.
4238 : */
4239 : static int
4385 tgl 4240 GIC 1825 : XLOGChooseNumBuffers(void)
4460 tgl 4241 ECB : {
4385 4242 : int xbuffers;
4243 :
4385 tgl 4244 GIC 1825 : xbuffers = NBuffers / 32;
2028 andres 4245 GBC 1825 : if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
2028 andres 4246 GIC 18 : xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4385 tgl 4247 1825 : if (xbuffers < 8)
4248 327 : xbuffers = 8;
4249 1825 : return xbuffers;
4250 : }
4251 :
4252 : /*
4253 : * GUC check_hook for wal_buffers
4254 : */
4385 tgl 4255 ECB : bool
4385 tgl 4256 GBC 3682 : check_wal_buffers(int *newval, void **extra, GucSource source)
4257 : {
4385 tgl 4258 ECB : /*
4259 : * -1 indicates a request for auto-tune.
4260 : */
4385 tgl 4261 GIC 3682 : if (*newval == -1)
4262 : {
4263 : /*
4264 : * If we haven't yet changed the boot_val default of -1, just let it
3260 bruce 4265 ECB : * be. We'll fix it when XLOGShmemSize is called.
4266 : */
4385 tgl 4267 GIC 1857 : if (XLOGbuffers == -1)
4268 1857 : return true;
4269 :
4270 : /* Otherwise, substitute the auto-tune value */
4385 tgl 4271 UIC 0 : *newval = XLOGChooseNumBuffers();
4272 : }
4385 tgl 4273 ECB :
4274 : /*
4275 : * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
4276 : * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4277 : * the case, we just silently treat such values as a request for the
4278 : * minimum. (We could throw an error instead, but that doesn't seem very
4279 : * helpful.)
4280 : */
4385 tgl 4281 GIC 1825 : if (*newval < 4)
4385 tgl 4282 UBC 0 : *newval = 4;
4385 tgl 4283 EUB :
4385 tgl 4284 GBC 1825 : return true;
4460 tgl 4285 EUB : }
4286 :
4287 : /*
4288 : * GUC check_hook for wal_consistency_checking
4289 : */
4290 : bool
208 tgl 4291 GNC 1859 : check_wal_consistency_checking(char **newval, void **extra, GucSource source)
4292 : {
4293 : char *rawstring;
4294 : List *elemlist;
4295 : ListCell *l;
4296 : bool newwalconsistency[RM_MAX_ID + 1];
4297 :
4298 : /* Initialize the array */
4299 61347 : MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
4300 :
4301 : /* Need a modifiable copy of string */
4302 1859 : rawstring = pstrdup(*newval);
4303 :
4304 : /* Parse string into list of identifiers */
4305 1859 : if (!SplitIdentifierString(rawstring, ',', &elemlist))
4306 : {
4307 : /* syntax error in list */
208 tgl 4308 UNC 0 : GUC_check_errdetail("List syntax is invalid.");
4309 0 : pfree(rawstring);
4310 0 : list_free(elemlist);
4311 0 : return false;
4312 : }
4313 :
208 tgl 4314 GNC 1861 : foreach(l, elemlist)
4315 : {
4316 2 : char *tok = (char *) lfirst(l);
4317 : int rmid;
4318 :
4319 : /* Check for 'all'. */
4320 2 : if (pg_strcasecmp(tok, "all") == 0)
4321 : {
208 tgl 4322 UNC 0 : for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4323 0 : if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
4324 0 : newwalconsistency[rmid] = true;
4325 : }
4326 : else
4327 : {
4328 : /* Check if the token matches any known resource manager. */
208 tgl 4329 GNC 2 : bool found = false;
4330 :
4331 36 : for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4332 : {
4333 54 : if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
4334 18 : pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
4335 : {
4336 2 : newwalconsistency[rmid] = true;
4337 2 : found = true;
4338 2 : break;
4339 : }
4340 : }
4341 2 : if (!found)
4342 : {
4343 : /*
4344 : * During startup, it might be a not-yet-loaded custom
4345 : * resource manager. Defer checking until
4346 : * InitializeWalConsistencyChecking().
4347 : */
208 tgl 4348 UNC 0 : if (!process_shared_preload_libraries_done)
4349 : {
4350 0 : check_wal_consistency_checking_deferred = true;
4351 : }
4352 : else
4353 : {
4354 0 : GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
4355 0 : pfree(rawstring);
4356 0 : list_free(elemlist);
4357 0 : return false;
4358 : }
4359 : }
4360 : }
4361 : }
4362 :
208 tgl 4363 GNC 1859 : pfree(rawstring);
4364 1859 : list_free(elemlist);
4365 :
4366 : /* assign new value */
4367 1859 : *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool));
4368 1859 : memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
4369 1859 : return true;
4370 : }
4371 :
4372 : /*
4373 : * GUC assign_hook for wal_consistency_checking
4374 : */
4375 : void
4376 1859 : assign_wal_consistency_checking(const char *newval, void *extra)
4377 : {
4378 : /*
4379 : * If some checks were deferred, it's possible that the checks will fail
4380 : * later during InitializeWalConsistencyChecking(). But in that case, the
4381 : * postmaster will exit anyway, so it's safe to proceed with the
4382 : * assignment.
4383 : *
4384 : * Any built-in resource managers specified are assigned immediately,
4385 : * which affects WAL created before shared_preload_libraries are
4386 : * processed. Any custom resource managers specified won't be assigned
4387 : * until after shared_preload_libraries are processed, but that's OK
4388 : * because WAL for a custom resource manager can't be written before the
4389 : * module is loaded anyway.
4390 : */
4391 1859 : wal_consistency_checking = extra;
4392 1859 : }
4393 :
4394 : /*
4395 : * InitializeWalConsistencyChecking: run after loading custom resource managers
4396 : *
4397 : * If any unknown resource managers were specified in the
4398 : * wal_consistency_checking GUC, processing was deferred. Now that
4399 : * shared_preload_libraries have been loaded, process wal_consistency_checking
4400 : * again.
4401 : */
4402 : void
4403 910 : InitializeWalConsistencyChecking(void)
4404 : {
4405 910 : Assert(process_shared_preload_libraries_done);
4406 :
4407 910 : if (check_wal_consistency_checking_deferred)
4408 : {
4409 : struct config_generic *guc;
4410 :
208 tgl 4411 UNC 0 : guc = find_option("wal_consistency_checking", false, false, ERROR);
4412 :
4413 0 : check_wal_consistency_checking_deferred = false;
4414 :
4415 0 : set_config_option_ext("wal_consistency_checking",
4416 : wal_consistency_checking_string,
4417 : guc->scontext, guc->source, guc->srole,
4418 : GUC_ACTION_SET, true, ERROR, false);
4419 :
4420 : /* checking should not be deferred again */
4421 0 : Assert(!check_wal_consistency_checking_deferred);
4422 : }
208 tgl 4423 GNC 910 : }
4424 :
4425 : /*
4426 : * GUC show_hook for archive_command
4427 : */
4428 : const char *
4429 1088 : show_archive_command(void)
4430 : {
4431 1088 : if (XLogArchivingActive())
208 tgl 4432 UNC 0 : return XLogArchiveCommand;
4433 : else
208 tgl 4434 GNC 1088 : return "(disabled)";
4435 : }
4436 :
/*
 * GUC show_hook for in_hot_standby
 *
 * Returns "on" or "off" according to RecoveryInProgress().
 */
const char *
show_in_hot_standby(void)
{
	/*
	 * Report the live state taken from shared memory rather than the
	 * underlying variable (in_hot_standby_guc), which only changes when a
	 * new value is transmitted to the client.  That way the GUC shows
	 * up-to-date state even when examined intra-query.
	 */
	if (RecoveryInProgress())
		return "on";

	return "off";
}
4451 :
4452 : /*
2034 andres 4453 ECB : * Read the control file, set respective GUCs.
4454 : *
2030 4455 : * This is to be called during startup, including a crash recovery cycle,
4456 : * unless in bootstrap mode, where no control file yet exists. As there's no
4457 : * usable shared memory yet (its sizing can depend on the contents of the
4458 : * control file!), first store the contents in local memory. XLOGShmemInit()
4459 : * will then copy it to shared memory later.
4460 : *
2030 andres 4461 EUB : * reset just controls whether previous contents are to be expected (in the
4462 : * reset case, there's a dangling pointer into old shared memory), or not.
2034 4463 : */
4464 : void
2030 andres 4465 GIC 917 : LocalProcessControlFile(bool reset)
4466 : {
4467 917 : Assert(reset || ControlFile == NULL);
2034 andres 4468 CBC 917 : ControlFile = palloc(sizeof(ControlFileData));
2034 andres 4469 GIC 917 : ReadControlFile();
2034 andres 4470 CBC 917 : }
4471 :
4472 : /*
4473 : * Get the wal_level from the control file. For a standby, this value should be
4474 : * considered as its active wal_level, because it may be different from what
4475 : * was originally configured on standby.
4476 : */
4477 : WalLevel
1 andres 4478 GNC 61 : GetActiveWalLevelOnStandby(void)
4479 : {
4480 61 : return ControlFile->wal_level;
4481 : }
4482 :
8170 tgl 4483 ECB : /*
8062 4484 : * Initialization of shared memory for XLOG
4485 : */
6441 4486 : Size
8174 peter_e 4487 CBC 4564 : XLOGShmemSize(void)
8586 vadim4o 4488 ECB : {
4489 : Size size;
4490 :
4385 tgl 4491 : /*
4492 : * If the value of wal_buffers is -1, use the preferred auto-tune value.
4493 : * This isn't an amazingly clean place to do this, but we must wait till
4494 : * NBuffers has received its final value, and must do it before using the
4495 : * value of XLOGbuffers to do anything important.
4496 : *
4497 : * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
305 tgl 4498 EUB : * However, if the DBA explicitly set wal_buffers = -1 in the config file,
4499 : * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
4500 : * the matter with PGC_S_OVERRIDE.
4501 : */
4385 tgl 4502 GIC 4564 : if (XLOGbuffers == -1)
4503 : {
4385 tgl 4504 EUB : char buf[32];
4505 :
4385 tgl 4506 GBC 1825 : snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
305 4507 1825 : SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
4508 : PGC_S_DYNAMIC_DEFAULT);
305 tgl 4509 GIC 1825 : if (XLOGbuffers == -1) /* failed to apply it? */
305 tgl 4510 UIC 0 : SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
4511 : PGC_S_OVERRIDE);
4512 : }
4460 tgl 4513 CBC 4564 : Assert(XLOGbuffers > 0);
4460 tgl 4514 ECB :
4515 : /* XLogCtl */
6441 tgl 4516 GIC 4564 : size = sizeof(XLogCtlData);
3562 heikki.linnakangas 4517 ECB :
3306 4518 : /* WAL insertion locks, plus alignment */
3112 heikki.linnakangas 4519 CBC 4564 : size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4520 : /* xlblocks array */
6441 tgl 4521 GIC 4564 : size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4522 : /* extra alignment padding for XLOG I/O buffers */
1 tmunro 4523 GNC 4564 : size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
4524 : /* and the buffers themselves */
6215 tgl 4525 GIC 4564 : size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
6441 tgl 4526 ECB :
4527 : /*
4528 : * Note: we don't count ControlFileData, it comes out of the "slop factor"
4529 : * added by CreateSharedMemoryAndSemaphores. This lets us use this
4530 : * routine again below to compute the actual allocation size.
4531 : */
4532 :
6441 tgl 4533 GIC 4564 : return size;
4534 : }
4535 :
4536 : void
8586 vadim4o 4537 1826 : XLOGShmemInit(void)
4538 : {
4539 : bool foundCFile,
4540 : foundXLog;
6441 tgl 4541 ECB : char *allocptr;
3562 heikki.linnakangas 4542 : int i;
4543 : ControlFileData *localControlFile;
4544 :
4545 : #ifdef WAL_DEBUG
4546 :
4547 : /*
4548 : * Create a memory context for WAL debugging that's exempt from the normal
4549 : * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
4550 : * an allocation fails, but wal_debug is not for production use anyway.
4551 : */
4552 : if (walDebugCxt == NULL)
3205 4553 : {
4554 : walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4555 : "WAL Debug",
4556 : ALLOCSET_DEFAULT_SIZES);
4557 : MemoryContextAllowInCriticalSection(walDebugCxt, true);
4558 : }
4559 : #endif
4560 :
2030 andres 4561 EUB :
2030 andres 4562 GIC 1826 : XLogCtl = (XLogCtlData *)
2030 andres 4563 GBC 1826 : ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4564 :
2034 4565 1826 : localControlFile = ControlFile;
8170 tgl 4566 GIC 1826 : ControlFile = (ControlFileData *)
7050 bruce 4567 1826 : ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4568 :
6439 tgl 4569 1826 : if (foundCFile || foundXLog)
4570 : {
7050 bruce 4571 EUB : /* both should be present or neither */
6439 tgl 4572 UIC 0 : Assert(foundCFile && foundXLog);
3181 rhaas 4573 ECB :
4574 : /* Initialize local copy of WALInsertLocks */
3181 rhaas 4575 UIC 0 : WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
4576 :
2030 andres 4577 0 : if (localControlFile)
4578 0 : pfree(localControlFile);
7050 bruce 4579 LBC 0 : return;
4580 : }
8062 tgl 4581 CBC 1826 : memset(XLogCtl, 0, sizeof(XLogCtlData));
8053 bruce 4582 EUB :
4583 : /*
2030 andres 4584 ECB : * Already have read control file locally, unless in bootstrap mode. Move
4585 : * contents into shared memory.
4586 : */
2030 andres 4587 GIC 1826 : if (localControlFile)
4588 : {
4589 911 : memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
4590 911 : pfree(localControlFile);
2030 andres 4591 ECB : }
4592 :
4593 : /*
4594 : * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4595 : * multiple of the alignment for same, so no extra alignment padding is
4596 : * needed here.
4597 : */
3562 heikki.linnakangas 4598 GIC 1826 : allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
6441 tgl 4599 CBC 1826 : XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
8062 tgl 4600 GIC 1826 : memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
6441 4601 1826 : allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4602 :
4603 :
4604 : /* WAL insertion locks. Ensure they're aligned to the full padded size */
3306 heikki.linnakangas 4605 1826 : allocptr += sizeof(WALInsertLockPadded) -
2118 tgl 4606 1826 : ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
3306 heikki.linnakangas 4607 1826 : WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
4608 : (WALInsertLockPadded *) allocptr;
3112 4609 1826 : allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
4610 :
4611 16434 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
4612 : {
2672 rhaas 4613 14608 : LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
3306 heikki.linnakangas 4614 14608 : WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
2299 andres 4615 CBC 14608 : WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
4616 : }
3562 heikki.linnakangas 4617 ECB :
8062 tgl 4618 : /*
3562 heikki.linnakangas 4619 : * Align the start of the page buffers to a full xlog block size boundary.
3260 bruce 4620 : * This simplifies some calculations in XLOG insertion. It is also
4621 : * required for O_DIRECT.
4622 : */
3562 heikki.linnakangas 4623 GIC 1826 : allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
6441 tgl 4624 1826 : XLogCtl->pages = allocptr;
6215 4625 1826 : memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4626 :
4627 : /*
6385 bruce 4628 ECB : * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4629 : * in additional info.)
8062 tgl 4630 : */
8062 tgl 4631 GIC 1826 : XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
1080 michael 4632 1826 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
650 noah 4633 1826 : XLogCtl->InstallXLogFileSegmentActive = false;
3988 tgl 4634 1826 : XLogCtl->WalWriterSleeping = false;
4635 :
3562 heikki.linnakangas 4636 1826 : SpinLockInit(&XLogCtl->Insert.insertpos_lck);
7862 tgl 4637 CBC 1826 : SpinLockInit(&XLogCtl->info_lck);
3709 heikki.linnakangas 4638 GIC 1826 : SpinLockInit(&XLogCtl->ulsn_lck);
4639 : }
4640 :
4641 : /*
4642 : * This func must be called ONCE on system install. It creates pg_control
4643 : * and the initial XLOG segment.
4644 : */
4645 : void
8062 tgl 4646 305 : BootStrapXLOG(void)
4647 : {
4648 : CheckPoint checkPoint;
4649 : char *buffer;
4650 : XLogPageHeader page;
4651 : XLogLongPageHeader longpage;
8397 bruce 4652 ECB : XLogRecord *record;
4653 : char *recptr;
4654 : uint64 sysidentifier;
4655 : struct timeval tv;
2917 heikki.linnakangas 4656 : pg_crc32c crc;
8586 vadim4o 4657 :
4658 : /* allow ordinary WAL segment creation, like StartupXLOG() would */
235 michael 4659 GNC 305 : SetInstallXLogFileSegmentActive();
4660 :
6997 tgl 4661 ECB : /*
4662 : * Select a hopefully-unique system identifier code for this installation.
4663 : * We use the result of gettimeofday(), including the fractional seconds
6385 bruce 4664 : * field, as being about as unique as we can easily get. (Think not to
4665 : * use random(), since it hasn't been seeded and there's no portable way
4666 : * to seed it other than the system clock value...) The upper half of the
3270 tgl 4667 : * uint64 value is just the tv_sec part, while the lower half contains the
4668 : * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
4669 : * PID for a little extra uniqueness. A person knowing this encoding can
4670 : * determine the initialization time of the installation, which could
4671 : * perhaps be useful sometimes.
4672 : */
6997 tgl 4673 CBC 305 : gettimeofday(&tv, NULL);
6997 tgl 4674 GIC 305 : sysidentifier = ((uint64) tv.tv_sec) << 32;
3270 4675 305 : sysidentifier |= ((uint64) tv.tv_usec) << 12;
4676 305 : sysidentifier |= getpid() & 0xFFF;
4677 :
4678 : /* page buffer must be aligned suitably for O_DIRECT */
3562 heikki.linnakangas 4679 305 : buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4680 305 : page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
6215 tgl 4681 CBC 305 : memset(page, 0, XLOG_BLCKSZ);
4682 :
4683 : /*
4684 : * Set up information for the initial checkpoint record
4541 heikki.linnakangas 4685 ECB : *
4686 : * The initial checkpoint record is written to the beginning of the WAL
4687 : * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4688 : * used, so that we can use 0/0 to mean "before any valid WAL segment".
4689 : */
2028 andres 4690 GIC 305 : checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
520 rhaas 4691 305 : checkPoint.ThisTimeLineID = BootstrapTimeLineID;
4692 305 : checkPoint.PrevTimeLineID = BootstrapTimeLineID;
4092 simon 4693 305 : checkPoint.fullPageWrites = fullPageWrites;
4694 : checkPoint.nextXid =
1473 tmunro 4695 305 : FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
633 tgl 4696 305 : checkPoint.nextOid = FirstGenbkiObjectId;
6555 4697 305 : checkPoint.nextMulti = FirstMultiXactId;
6514 4698 305 : checkPoint.nextMultiOffset = 0;
4969 4699 305 : checkPoint.oldestXid = FirstNormalTransactionId;
353 4700 305 : checkPoint.oldestXidDB = Template1DbOid;
3728 alvherre 4701 305 : checkPoint.oldestMulti = FirstMultiXactId;
353 tgl 4702 305 : checkPoint.oldestMultiDB = Template1DbOid;
2659 mail 4703 305 : checkPoint.oldestCommitTsXid = InvalidTransactionId;
4704 305 : checkPoint.newestCommitTsXid = InvalidTransactionId;
5530 tgl 4705 305 : checkPoint.time = (pg_time_t) time(NULL);
4859 simon 4706 305 : checkPoint.oldestActiveXid = InvalidTransactionId;
4707 :
971 andres 4708 305 : ShmemVariableCache->nextXid = checkPoint.nextXid;
8174 vadim4o 4709 305 : ShmemVariableCache->nextOid = checkPoint.nextOid;
8174 vadim4o 4710 CBC 305 : ShmemVariableCache->oidCount = 0;
6514 tgl 4711 305 : MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
2208 rhaas 4712 GIC 305 : AdvanceOldestClogXid(checkPoint.oldestXid);
4799 tgl 4713 CBC 305 : SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
2217 4714 305 : SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
3049 alvherre 4715 305 : SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
4716 :
6997 tgl 4717 ECB : /* Set up the XLOG page header */
8586 vadim4o 4718 GIC 305 : page->xlp_magic = XLOG_PAGE_MAGIC;
6836 tgl 4719 305 : page->xlp_info = XLP_LONG_HEADER;
520 rhaas 4720 GBC 305 : page->xlp_tli = BootstrapTimeLineID;
2028 andres 4721 GIC 305 : page->xlp_pageaddr = wal_segment_size;
6836 tgl 4722 305 : longpage = (XLogLongPageHeader) page;
6836 tgl 4723 GBC 305 : longpage->xlp_sysid = sysidentifier;
2028 andres 4724 GIC 305 : longpage->xlp_seg_size = wal_segment_size;
6213 tgl 4725 GBC 305 : longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
6997 tgl 4726 EUB :
4727 : /* Insert the initial checkpoint record */
3062 heikki.linnakangas 4728 GIC 305 : recptr = ((char *) page + SizeOfXLogLongPHD);
3062 heikki.linnakangas 4729 CBC 305 : record = (XLogRecord *) recptr;
3941 heikki.linnakangas 4730 GIC 305 : record->xl_prev = 0;
8586 vadim4o 4731 305 : record->xl_xid = InvalidTransactionId;
3062 heikki.linnakangas 4732 305 : record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
8062 tgl 4733 305 : record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
8586 vadim4o 4734 305 : record->xl_rmid = RM_XLOG_ID;
3062 heikki.linnakangas 4735 CBC 305 : recptr += SizeOfXLogRecord;
4736 : /* fill the XLogRecordDataHeaderShort struct */
2203 tgl 4737 305 : *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
3062 heikki.linnakangas 4738 305 : *(recptr++) = sizeof(checkPoint);
3062 heikki.linnakangas 4739 GIC 305 : memcpy(recptr, &checkPoint, sizeof(checkPoint));
4740 305 : recptr += sizeof(checkPoint);
4741 305 : Assert(recptr - (char *) record == record->xl_tot_len);
4742 :
3078 4743 305 : INIT_CRC32C(crc);
3062 4744 305 : COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
3078 4745 305 : COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
3078 heikki.linnakangas 4746 CBC 305 : FIN_CRC32C(crc);
8137 vadim4o 4747 305 : record->xl_crc = crc;
8137 vadim4o 4748 ECB :
6997 tgl 4749 : /* Create first XLOG segment file */
520 rhaas 4750 GIC 305 : openLogTLI = BootstrapTimeLineID;
4751 305 : openLogFile = XLogFileInit(1, BootstrapTimeLineID);
4752 :
1140 tgl 4753 ECB : /*
4754 : * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
4755 : * close the file again in a moment.
4756 : */
4757 :
4758 : /* Write the first page with the initial record */
7977 tgl 4759 CBC 305 : errno = 0;
2213 rhaas 4760 GIC 305 : pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
6215 tgl 4761 CBC 305 : if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
7977 tgl 4762 ECB : {
4763 : /* if write didn't set errno, assume problem is no disk space */
7977 tgl 4764 UIC 0 : if (errno == 0)
4765 0 : errno = ENOSPC;
7202 4766 0 : ereport(PANIC,
4767 : (errcode_for_file_access(),
4768 : errmsg("could not write bootstrap write-ahead log file: %m")));
4769 : }
2213 rhaas 4770 GIC 305 : pgstat_report_wait_end();
8586 vadim4o 4771 ECB :
2213 rhaas 4772 CBC 305 : pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
8062 tgl 4773 305 : if (pg_fsync(openLogFile) != 0)
7202 tgl 4774 UIC 0 : ereport(PANIC,
4775 : (errcode_for_file_access(),
4776 : errmsg("could not fsync bootstrap write-ahead log file: %m")));
2213 rhaas 4777 GIC 305 : pgstat_report_wait_end();
4778 :
1373 peter 4779 CBC 305 : if (close(openLogFile) != 0)
7013 tgl 4780 LBC 0 : ereport(PANIC,
7013 tgl 4781 ECB : (errcode_for_file_access(),
2118 4782 : errmsg("could not close bootstrap write-ahead log file: %m")));
4783 :
8062 tgl 4784 CBC 305 : openLogFile = -1;
8586 vadim4o 4785 ECB :
6997 tgl 4786 : /* Now create pg_control */
1147 peter 4787 GIC 305 : InitControlFile(sysidentifier);
8062 tgl 4788 305 : ControlFile->time = checkPoint.time;
8586 vadim4o 4789 305 : ControlFile->checkPoint = checkPoint.redo;
8062 tgl 4790 305 : ControlFile->checkPointCopy = checkPoint;
4791 :
4792 : /* some additional ControlFile fields are set in WriteControlFile() */
8170 4793 305 : WriteControlFile();
7897 tgl 4794 ECB :
4795 : /* Bootstrap the commit log, too */
7897 tgl 4796 GIC 305 : BootStrapCLOG();
3049 alvherre 4797 305 : BootStrapCommitTs();
6856 tgl 4798 305 : BootStrapSUBTRANS();
6555 4799 305 : BootStrapMultiXact();
4800 :
6441 4801 305 : pfree(buffer);
4802 :
4803 : /*
4804 : * Force control file to be read - in contrast to normal processing we'd
4805 : * otherwise never run the checks and GUC related initializations therein.
4806 : */
2034 andres 4807 CBC 305 : ReadControlFile();
8586 vadim4o 4808 GIC 305 : }
4809 :
4810 : static char *
5727 tgl 4811 566 : str_time(pg_time_t tnow)
4812 : {
4813 : static char buf[128];
4814 :
4815 566 : pg_strftime(buf, sizeof(buf),
4816 : "%Y-%m-%d %H:%M:%S %Z",
4817 566 : pg_localtime(&tnow, log_timezone));
4818 :
8174 peter_e 4819 566 : return buf;
4820 : }
8586 vadim4o 4821 ECB :
6838 tgl 4822 : /*
417 heikki.linnakangas 4823 : * Initialize the first WAL segment on new timeline.
6838 tgl 4824 : */
4825 : static void
417 heikki.linnakangas 4826 GIC 39 : XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
6838 tgl 4827 ECB : {
3090 fujii 4828 : char xlogfname[MAXFNAMELEN];
3034 heikki.linnakangas 4829 : XLogSegNo endLogSegNo;
4830 : XLogSegNo startLogSegNo;
4831 :
4832 : /* we always switch to a new timeline after archive recovery */
520 rhaas 4833 GIC 39 : Assert(endTLI != newTLI);
4834 :
4835 : /*
4836 : * Update min recovery point one last time.
4837 : */
5036 heikki.linnakangas 4838 CBC 39 : UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5036 heikki.linnakangas 4839 ECB :
6838 tgl 4840 : /*
3034 heikki.linnakangas 4841 : * Calculate the last segment on the old timeline, and the first segment
4842 : * on the new timeline. If the switch happens in the middle of a segment,
4843 : * they are the same, but if the switch happens exactly at a segment
4844 : * boundary, startLogSegNo will be endLogSegNo + 1.
4845 : */
2028 andres 4846 CBC 39 : XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
4847 39 : XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
3034 heikki.linnakangas 4848 ECB :
4849 : /*
4850 : * Initialize the starting WAL segment for the new timeline. If the switch
4851 : * happens in the middle of a segment, copy data from the last WAL segment
4852 : * of the old timeline up to the switch point, to the starting WAL segment
4853 : * on the new timeline.
6838 tgl 4854 : */
3034 heikki.linnakangas 4855 GIC 39 : if (endLogSegNo == startLogSegNo)
6838 tgl 4856 ECB : {
2893 heikki.linnakangas 4857 : /*
4858 : * Make a copy of the file on the new timeline.
4859 : *
4860 : * Writing WAL isn't allowed yet, so there are no locking
4861 : * considerations. But we should be just as tense as XLogFileInit to
4862 : * avoid emplacing a bogus file.
4863 : */
520 rhaas 4864 GIC 30 : XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
2028 andres 4865 30 : XLogSegmentOffset(endOfLog, wal_segment_size));
6838 tgl 4866 ECB : }
3034 heikki.linnakangas 4867 : else
4868 : {
2893 4869 : /*
4870 : * The switch happened at a segment boundary, so just create the next
4871 : * segment on the new timeline.
4872 : */
3031 4873 : int fd;
4874 :
520 rhaas 4875 GIC 9 : fd = XLogFileInit(startLogSegNo, newTLI);
3031 heikki.linnakangas 4876 ECB :
1373 peter 4877 CBC 9 : if (close(fd) != 0)
1223 michael 4878 ECB : {
1223 michael 4879 LBC 0 : int save_errno = errno;
1223 michael 4880 ECB :
520 rhaas 4881 LBC 0 : XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
1223 michael 4882 0 : errno = save_errno;
3031 heikki.linnakangas 4883 UIC 0 : ereport(ERROR,
3031 heikki.linnakangas 4884 ECB : (errcode_for_file_access(),
1223 michael 4885 : errmsg("could not close file \"%s\": %m", xlogfname)));
4886 : }
3034 heikki.linnakangas 4887 : }
6838 tgl 4888 :
4889 : /*
6385 bruce 4890 : * Let's just make real sure there are not .ready or .done flags posted
4891 : * for the new segment.
6838 tgl 4892 : */
520 rhaas 4893 CBC 39 : XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
3090 fujii 4894 39 : XLogArchiveCleanup(xlogfname);
6838 tgl 4895 GIC 39 : }
4896 :
543 rhaas 4897 ECB : /*
4898 : * Perform cleanup actions at the conclusion of archive recovery.
4899 : */
4900 : static void
520 rhaas 4901 GIC 39 : CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
4902 : TimeLineID newTLI)
4903 : {
4904 : /*
4905 : * Execute the recovery_end_command, if any.
543 rhaas 4906 ECB : */
543 rhaas 4907 CBC 39 : if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
62 michael 4908 2 : ExecuteRecoveryCommand(recoveryEndCommand,
4909 : "recovery_end_command",
4910 : true,
62 michael 4911 EUB : WAIT_EVENT_RECOVERY_END_COMMAND);
543 rhaas 4912 :
4913 : /*
4914 : * We switched to a new timeline. Clean up segments on the old timeline.
4915 : *
4916 : * If there are any higher-numbered segments on the old timeline, remove
417 heikki.linnakangas 4917 ECB : * them. They might contain valid WAL, but they might also be
4918 : * pre-allocated files containing garbage. In any case, they are not part
4919 : * of the new timeline's history so we don't need them.
543 rhaas 4920 : */
520 rhaas 4921 GBC 39 : RemoveNonParentXlogFiles(EndOfLog, newTLI);
4922 :
4923 : /*
543 rhaas 4924 ECB : * If the switch happened in the middle of a segment, what to do with the
4925 : * last, partial segment on the old timeline? If we don't archive it, and
417 heikki.linnakangas 4926 : * the server that created the WAL never archives it either (e.g. because
417 heikki.linnakangas 4927 EUB : * it was hit by a meteor), it will never make it to the archive. That's
4928 : * OK from our point of view, because the new segment that we created with
4929 : * the new TLI contains all the WAL from the old timeline up to the switch
4930 : * point. But if you later try to do PITR to the "missing" WAL on the old
417 heikki.linnakangas 4931 ECB : * timeline, recovery won't find it in the archive. It's physically
4932 : * present in the new file with new TLI, but recovery won't look there
4933 : * when it's recovering to the older timeline. On the other hand, if we
4934 : * archive the partial segment, and the original server on that timeline
4935 : * is still running and archives the completed version of the same segment
4936 : * later, it will fail. (We used to do that in 9.4 and below, and it
4937 : * caused such problems).
4938 : *
4939 : * As a compromise, we rename the last segment with the .partial suffix,
4940 : * and archive it. Archive recovery will never try to read .partial
4941 : * segments, so they will normally go unused. But in the odd PITR case,
4942 : * the administrator can copy them manually to the pg_wal directory
4943 : * (removing the suffix). They can be useful in debugging, too.
543 rhaas 4944 : *
4945 : * If a .done or .ready file already exists for the old timeline, however,
417 heikki.linnakangas 4946 : * we had already determined that the segment is complete, so we can let
4947 : * it be archived normally. (In particular, if it was restored from the
4948 : * archive to begin with, it's expected to have a .done file).
4949 : */
543 rhaas 4950 GIC 39 : if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
4951 30 : XLogArchivingActive())
4952 : {
4953 : char origfname[MAXFNAMELEN];
543 rhaas 4954 ECB : XLogSegNo endLogSegNo;
4955 :
543 rhaas 4956 GIC 6 : XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
4957 6 : XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
543 rhaas 4958 ECB :
543 rhaas 4959 GIC 6 : if (!XLogArchiveIsReadyOrDone(origfname))
4960 : {
4961 : char origpath[MAXPGPATH];
543 rhaas 4962 ECB : char partialfname[MAXFNAMELEN];
4963 : char partialpath[MAXPGPATH];
4964 :
543 rhaas 4965 GIC 4 : XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
543 rhaas 4966 CBC 4 : snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
543 rhaas 4967 GIC 4 : snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
4968 :
4969 : /*
4970 : * Make sure there's no .done or .ready file for the .partial
4971 : * file.
4972 : */
543 rhaas 4973 CBC 4 : XLogArchiveCleanup(partialfname);
4974 :
543 rhaas 4975 GIC 4 : durable_rename(origpath, partialpath, ERROR);
4976 4 : XLogArchiveNotify(partialfname);
4977 : }
4978 : }
4979 39 : }
543 rhaas 4980 ECB :
4981 : /*
4982 : * Check to see if required parameters are set high enough on this server
4983 : * for various aspects of recovery operation.
4984 : *
417 heikki.linnakangas 4985 : * Note that all the parameters which this function tests need to be
4986 : * listed in Administrator's Overview section in high-availability.sgml.
4987 : * If you change them, don't forget to update the list.
4988 : */
4989 : static void
417 heikki.linnakangas 4990 GIC 170 : CheckRequiredParameterValues(void)
4991 : {
4992 : /*
417 heikki.linnakangas 4993 ECB : * For archive recovery, the WAL must be generated with at least 'replica'
4994 : * wal_level.
4995 : */
417 heikki.linnakangas 4996 GIC 170 : if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
4997 : {
4998 2 : ereport(FATAL,
4999 : (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
5000 : errdetail("This happens if you temporarily set wal_level=minimal on the server."),
5001 : errhint("Use a backup taken after setting wal_level to higher than minimal.")));
3377 heikki.linnakangas 5002 ECB : }
5003 :
5004 : /*
5005 : * For Hot Standby, the WAL must be generated with 'replica' mode, and we
5006 : * must have at least as many backend slots as the primary.
5007 : */
3322 heikki.linnakangas 5008 GIC 168 : if (ArchiveRecoveryRequested && EnableHotStandby)
5009 : {
5010 : /* We ignore autovacuum_max_workers when we make this test. */
4729 heikki.linnakangas 5011 CBC 75 : RecoveryRequiresIntParameter("max_connections",
4728 tgl 5012 ECB : MaxConnections,
4728 tgl 5013 GIC 75 : ControlFile->MaxConnections);
3566 rhaas 5014 75 : RecoveryRequiresIntParameter("max_worker_processes",
5015 : max_worker_processes,
5016 75 : ControlFile->max_worker_processes);
1517 michael 5017 75 : RecoveryRequiresIntParameter("max_wal_senders",
5018 : max_wal_senders,
5019 75 : ControlFile->max_wal_senders);
3868 tgl 5020 75 : RecoveryRequiresIntParameter("max_prepared_transactions",
5021 : max_prepared_xacts,
4728 tgl 5022 CBC 75 : ControlFile->max_prepared_xacts);
3868 tgl 5023 GIC 75 : RecoveryRequiresIntParameter("max_locks_per_transaction",
4728 tgl 5024 ECB : max_locks_per_xact,
4728 tgl 5025 GIC 75 : ControlFile->max_locks_per_xact);
4729 heikki.linnakangas 5026 EUB : }
4859 simon 5027 GIC 168 : }
4859 simon 5028 EUB :
8586 vadim4o 5029 : /*
8062 tgl 5030 : * This must be called ONCE during postmaster or standalone-backend startup
5031 : */
5032 : void
8062 tgl 5033 GIC 1176 : StartupXLOG(void)
5034 : {
5035 : XLogCtlInsert *Insert;
5036 : CheckPoint checkPoint;
5037 : bool wasShutdown;
5038 : bool didCrash;
5039 : bool haveTblspcMap;
417 heikki.linnakangas 5040 ECB : bool haveBackupLabel;
5041 : XLogRecPtr EndOfLog;
2880 5042 : TimeLineID EndOfLogTLI;
5043 : TimeLineID newTLI;
5044 : bool performedWalRecovery;
5045 : EndOfWalRecoveryInfo *endOfRecoveryInfo;
5046 : XLogRecPtr abortedRecPtr;
5047 : XLogRecPtr missingContrecPtr;
6505 tgl 5048 : TransactionId oldestActiveXID;
984 fujii 5049 GIC 1176 : bool promoted = false;
5050 :
5051 : /*
5052 : * We should have an aux process resource owner to use, and we should not
5053 : * be in a transaction that's installed some other resowner.
1726 tgl 5054 ECB : */
1726 tgl 5055 CBC 1176 : Assert(AuxProcessResourceOwner != NULL);
1726 tgl 5056 GIC 1176 : Assert(CurrentResourceOwner == NULL ||
5057 : CurrentResourceOwner == AuxProcessResourceOwner);
5058 1176 : CurrentResourceOwner = AuxProcessResourceOwner;
5059 :
5060 : /*
5061 : * Check that contents look valid.
5062 : */
1248 peter 5063 1176 : if (!XRecOffIsValid(ControlFile->checkPoint))
7202 tgl 5064 UIC 0 : ereport(FATAL,
5065 : (errmsg("control file contains invalid checkpoint location")));
5066 :
1248 peter 5067 GIC 1176 : switch (ControlFile->state)
3587 tgl 5068 ECB : {
1248 peter 5069 GIC 1031 : case DB_SHUTDOWNED:
5070 :
5071 : /*
5072 : * This is the expected case, so don't be chatty in standalone
5073 : * mode
5074 : */
5075 1031 : ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5076 : (errmsg("database system was shut down at %s",
5077 : str_time(ControlFile->time))));
5078 1031 : break;
5079 :
5080 14 : case DB_SHUTDOWNED_IN_RECOVERY:
5081 14 : ereport(LOG,
5082 : (errmsg("database system was shut down in recovery at %s",
5083 : str_time(ControlFile->time))));
5084 14 : break;
5085 :
1248 peter 5086 UIC 0 : case DB_SHUTDOWNING:
5087 0 : ereport(LOG,
5088 : (errmsg("database system shutdown was interrupted; last known up at %s",
5089 : str_time(ControlFile->time))));
5090 0 : break;
5091 :
5092 0 : case DB_IN_CRASH_RECOVERY:
5093 0 : ereport(LOG,
5094 : (errmsg("database system was interrupted while in recovery at %s",
5095 : str_time(ControlFile->time)),
5096 : errhint("This probably means that some data is corrupted and"
1248 peter 5097 ECB : " you will have to use the last backup for recovery.")));
1248 peter 5098 LBC 0 : break;
5099 :
1248 peter 5100 GIC 4 : case DB_IN_ARCHIVE_RECOVERY:
5101 4 : ereport(LOG,
5102 : (errmsg("database system was interrupted while in recovery at log time %s",
1248 peter 5103 ECB : str_time(ControlFile->checkPointCopy.time)),
5104 : errhint("If this has occurred more than once some data might be corrupted"
5105 : " and you might need to choose an earlier recovery target.")));
1248 peter 5106 CBC 4 : break;
5107 :
1248 peter 5108 GIC 127 : case DB_IN_PRODUCTION:
5109 127 : ereport(LOG,
5110 : (errmsg("database system was interrupted; last known up at %s",
5111 : str_time(ControlFile->time))));
1248 peter 5112 CBC 127 : break;
1248 peter 5113 ECB :
1248 peter 5114 LBC 0 : default:
1248 peter 5115 UIC 0 : ereport(FATAL,
5116 : (errmsg("control file contains invalid database cluster state")));
5117 : }
5118 :
5119 : /* This is just to allow attaching to startup process with a debugger */
7352 tgl 5120 ECB : #ifdef XLOG_REPLAY_DELAY
5121 : if (ControlFile->state != DB_SHUTDOWNED)
6929 bruce 5122 : pg_usleep(60000000L);
7352 tgl 5123 : #endif
5124 :
5125 : /*
2362 rhaas 5126 : * Verify that pg_wal and pg_wal/archive_status exist. In cases where
5127 : * someone has performed a copy for PITR, these directories may have been
5128 : * excluded and need to be re-created.
5129 : */
5264 tgl 5130 GIC 1176 : ValidateXLOGDirectoryStructure();
5131 :
5132 : /* Set up timeout handler needed to report startup progress. */
531 rhaas 5133 1176 : if (!IsBootstrapProcessingMode())
5134 871 : RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
5135 : startup_progress_timeout_handler);
5136 :
1731 michael 5137 ECB : /*----------
5138 : * If we previously crashed, perform a couple of actions:
5139 : *
5140 : * - The pg_wal directory may still include some temporary WAL segments
5141 : * used when creating a new segment, so perform some clean up to not
5142 : * bloat this path. This is done first as there is no point to sync
1147 peter 5143 : * this temporary data.
5144 : *
5145 : * - There might be data which we had written, intending to fsync it, but
5146 : * which we had not actually fsync'd yet. Therefore, a power failure in
5147 : * the near future might cause earlier unflushed writes to be lost, even
5148 : * though more recent data written to disk from here on would be
5149 : * persisted. To avoid that, fsync the entire data directory.
5150 : */
417 heikki.linnakangas 5151 GIC 1176 : if (ControlFile->state != DB_SHUTDOWNED &&
5152 145 : ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
5153 : {
5154 131 : RemoveTempXlogFiles();
417 heikki.linnakangas 5155 CBC 131 : SyncDataDirectory();
368 andres 5156 GIC 131 : didCrash = true;
5157 : }
368 andres 5158 ECB : else
368 andres 5159 GIC 1045 : didCrash = false;
417 heikki.linnakangas 5160 ECB :
5161 : /*
5162 : * Prepare for WAL recovery if needed.
5163 : *
5164 : * InitWalRecovery analyzes the control file and the backup label file, if
5165 : * any. It updates the in-memory ControlFile buffer according to the
5166 : * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
5167 : * It also applies the tablespace map file, if any.
5168 : */
417 heikki.linnakangas 5169 CBC 1176 : InitWalRecovery(ControlFile, &wasShutdown,
417 heikki.linnakangas 5170 ECB : &haveBackupLabel, &haveTblspcMap);
417 heikki.linnakangas 5171 GIC 1176 : checkPoint = ControlFile->checkPointCopy;
8586 vadim4o 5172 ECB :
5173 : /* initialize shared memory variables from the checkpoint record */
971 andres 5174 CBC 1176 : ShmemVariableCache->nextXid = checkPoint.nextXid;
8586 vadim4o 5175 GIC 1176 : ShmemVariableCache->nextOid = checkPoint.nextOid;
8192 5176 1176 : ShmemVariableCache->oidCount = 0;
6514 tgl 5177 1176 : MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
2208 rhaas 5178 1176 : AdvanceOldestClogXid(checkPoint.oldestXid);
4799 tgl 5179 1176 : SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
2217 tgl 5180 CBC 1176 : SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
2659 mail 5181 GIC 1176 : SetCommitTsLimit(checkPoint.oldestCommitTsXid,
5182 : checkPoint.newestCommitTsXid);
971 andres 5183 1176 : XLogCtl->ckptFullXid = checkPoint.nextXid;
5184 :
5185 : /*
5186 : * Clear out any old relcache cache files. This is *necessary* if we do
5187 : * any WAL replay, since that would probably result in the cache files
5188 : * being out of sync with database reality. In theory we could leave them
5189 : * in place if the database had been cleanly shut down, but it seems
5190 : * safest to just remove them always and let them be rebuilt during the
5191 : * first backend startup. These files need to be removed from all
5192 : * directories including pg_tblspc; however, the symlinks are created only
5193 : * after reading the tablespace_map file in case of archive recovery from
5194 : * backup, so the old relcache files must be cleared here, after creating
5195 : * the symlinks.
417 heikki.linnakangas 5196 ECB : */
417 heikki.linnakangas 5197 GIC 1176 : RelationCacheInitFileRemove();
5198 :
5199 : /*
5200 : * Initialize replication slots, before there's a chance to remove
5201 : * required resources.
3355 rhaas 5202 ECB : */
3223 andres 5203 CBC 1176 : StartupReplicationSlots();
5204 :
3324 rhaas 5205 ECB : /*
5206 : * Startup logical state, needs to be set up now so we have proper data
5207 : * during crash recovery.
5208 : */
3324 rhaas 5209 GIC 1176 : StartupReorderBuffer();
3324 rhaas 5210 ECB :
802 rhaas 5211 EUB : /*
5212 : * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
5213 : * been initialized and before we accept connections or begin WAL replay.
802 rhaas 5214 ECB : */
802 rhaas 5215 GIC 1176 : StartupCLOG();
802 rhaas 5216 ECB :
5217 : /*
5218 : * Startup MultiXact. We need to do this early to be able to replay
5219 : * truncations.
5220 : */
3418 alvherre 5221 GIC 1176 : StartupMultiXact();
3418 alvherre 5222 ECB :
5223 : /*
5224 : * Ditto for commit timestamps. Activate the facility if the setting is
1656 michael 5225 : * enabled in the control file, as there should be no tracking of commit
5226 : * timestamps done when the setting was disabled. This facility can be
5227 : * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
2676 alvherre 5228 : */
1656 michael 5229 GIC 1176 : if (ControlFile->track_commit_timestamp)
2676 alvherre 5230 8 : StartupCommitTs();
2676 alvherre 5231 ECB :
5232 : /*
2902 andres 5233 EUB : * Recover knowledge about replay progress of known replication partners.
5234 : */
2902 andres 5235 GIC 1176 : StartupReplicationOrigin();
5236 :
3709 heikki.linnakangas 5237 EUB : /*
5238 : * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5239 : * control file. On recovery, all unlogged relations are blown away, so
5240 : * the unlogged LSN counter can be reset too.
5241 : */
3709 heikki.linnakangas 5242 GIC 1176 : if (ControlFile->state == DB_SHUTDOWNED)
5243 1025 : XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
5244 : else
1260 michael 5245 GBC 151 : XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
5246 :
3728 heikki.linnakangas 5247 ECB : /*
3602 bruce 5248 : * Copy any missing timeline history files between 'now' and the recovery
5249 : * target timeline from archive to pg_wal. While we don't need those files
5250 : * ourselves - the history file of the recovery target timeline covers all
5251 : * the previous timelines in the history too - a cascading standby server
5252 : * might be interested in them. Or, if you archive the WAL from this
697 tgl 5253 : * server to a different archive than the primary, it'd be good for all
5254 : * the history files to get archived there after failover, so that you can
5255 : * use one of the old timelines as a PITR target. Timeline history files
5256 : * are small, so it's better to copy them unnecessarily than not copy them
5257 : * and regret later.
5258 : */
417 heikki.linnakangas 5259 CBC 1176 : restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
5260 :
2196 simon 5261 EUB : /*
2153 bruce 5262 : * Before running in recovery, scan pg_twophase and fill in its status to
5263 : * be able to work on entries generated by redo. Doing a scan before
5264 : * taking any recovery action has the merit to discard any 2PC files that
5265 : * are newer than the first record to replay, saving from any conflicts at
5266 : * replay. This avoids as well any subsequent scans when doing recovery
5267 : * of the on-disk two-phase data.
5268 : */
2196 simon 5269 GIC 1176 : restoreTwoPhaseData();
5270 :
5271 : /*
5272 : * When starting with crash recovery, reset pgstat data - it might not be
5273 : * valid. Otherwise restore pgstat data. It's safe to do this here,
5274 : * because postmaster will not yet have started any other processes.
5275 : *
5276 : * NB: Restoring replication slot stats relies on slot state to have
368 andres 5277 ECB : * already been restored from disk.
5278 : *
5279 : * TODO: With a bit of extra work we could just start with a pgstat file
5280 : * associated with the checkpoint redo location we're starting from.
5281 : */
368 andres 5282 GIC 1176 : if (didCrash)
5283 131 : pgstat_discard_stats();
5284 : else
5285 1045 : pgstat_restore_stats();
5286 :
4092 simon 5287 1176 : lastFullPageWrites = checkPoint.fullPageWrites;
5288 :
3562 heikki.linnakangas 5289 1176 : RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
3076 5290 1176 : doPageWrites = lastFullPageWrites;
5291 :
5292 : /* REDO */
6822 tgl 5293 1176 : if (InRecovery)
5294 : {
5295 : /* Initialize state for RecoveryInProgress() */
417 heikki.linnakangas 5296 151 : SpinLockAcquire(&XLogCtl->info_lck);
5297 151 : if (InArchiveRecovery)
417 heikki.linnakangas 5298 CBC 73 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
417 heikki.linnakangas 5299 ECB : else
417 heikki.linnakangas 5300 GIC 78 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
417 heikki.linnakangas 5301 CBC 151 : SpinLockRelease(&XLogCtl->info_lck);
417 heikki.linnakangas 5302 ECB :
6089 tgl 5303 : /*
5304 : * Update pg_control to show that we are recovering and to show the
5305 : * selected checkpoint as the place we are starting from. We also mark
6031 bruce 5306 : * pg_control with any minimum recovery stop point obtained from a
5307 : * backup history file.
5308 : *
5309 : * No need to hold ControlFileLock yet, we aren't up far enough.
5310 : */
417 heikki.linnakangas 5311 GIC 151 : UpdateControlFile();
5312 :
5313 : /*
5314 : * If there was a backup label file, it's done its job and the info
5315 : * has now been propagated into pg_control. We must get rid of the
417 heikki.linnakangas 5316 ECB : * label file so that if we crash during recovery, we'll pick up at
5317 : * the latest recovery restartpoint instead of going all the way back
5318 : * to the backup start point. It seems prudent though to just rename
5319 : * the file out of the way rather than delete it completely.
5320 : */
417 heikki.linnakangas 5321 CBC 151 : if (haveBackupLabel)
3698 heikki.linnakangas 5322 ECB : {
417 heikki.linnakangas 5323 CBC 51 : unlink(BACKUP_LABEL_OLD);
5324 51 : durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
3698 heikki.linnakangas 5325 ECB : }
4790 bruce 5326 :
4843 heikki.linnakangas 5327 : /*
417 5328 : * If there was a tablespace_map file, it's done its job and the
5329 : * symlinks have been created. We must get rid of the map file so
5330 : * that if we crash during recovery, we don't create symlinks again.
5331 : * It seems prudent though to just rename the file out of the way
5332 : * rather than delete it completely.
5333 : */
417 heikki.linnakangas 5334 GIC 151 : if (haveTblspcMap)
5335 : {
5336 1 : unlink(TABLESPACE_MAP_OLD);
5337 1 : durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
5338 : }
5339 :
5340 : /*
5341 : * Initialize our local copy of minRecoveryPoint. When doing crash
5342 : * recovery we want to replay up to the end of WAL. Particularly, in
5343 : * the case of a promoted standby minRecoveryPoint value in the
1739 michael 5344 ECB : * control file is only updated after the first checkpoint. However,
5345 : * if the instance crashes before the first post-recovery checkpoint
5346 : * is completed then recovery will use a stale location causing the
5347 : * startup process to think that there are still invalid page
5348 : * references when checking for data consistency.
5349 : */
1739 michael 5350 CBC 151 : if (InArchiveRecovery)
5351 : {
417 heikki.linnakangas 5352 GIC 73 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
5353 73 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5354 : }
5355 : else
1739 michael 5356 ECB : {
417 heikki.linnakangas 5357 GIC 78 : LocalMinRecoveryPoint = InvalidXLogRecPtr;
5358 78 : LocalMinRecoveryPointTLI = 0;
5359 : }
5360 :
5361 : /* Check that the GUCs used to generate the WAL allow recovery */
4729 heikki.linnakangas 5362 CBC 151 : CheckRequiredParameterValues();
5363 :
5364 : /*
5365 : * We're in recovery, so unlogged relations may be trashed and must be
5366 : * reset. This should be done BEFORE allowing Hot Standby
5367 : * connections, so that read-only backends don't try to read whatever
3955 bruce 5368 ECB : * garbage is left over from before.
5369 : */
4484 rhaas 5370 GIC 151 : ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5371 :
5372 : /*
5373 : * Likewise, delete any saved transaction snapshot files that got left
5374 : * behind by crashed backends.
5375 : */
4187 tgl 5376 CBC 151 : DeleteAllExportedSnapshotFiles();
4187 tgl 5377 ECB :
5378 : /*
5379 : * Initialize for Hot Standby, if enabled. We won't let backends in
5380 : * yet, not until we've reached the min recovery point specified in
5381 : * control file and we've established a recovery snapshot from a
4859 simon 5382 : * running-xacts WAL record.
5383 : */
3698 heikki.linnakangas 5384 GIC 151 : if (ArchiveRecoveryRequested && EnableHotStandby)
5385 : {
5386 : TransactionId *xids;
5387 : int nxids;
5388 :
4804 heikki.linnakangas 5389 CBC 71 : ereport(DEBUG1,
781 peter 5390 ECB : (errmsg_internal("initializing for hot standby")));
5391 :
4859 simon 5392 CBC 71 : InitRecoveryTransactionEnvironment();
5393 :
4859 simon 5394 GIC 71 : if (wasShutdown)
5395 12 : oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5396 : else
5397 59 : oldestActiveXID = checkPoint.oldestActiveXid;
5398 71 : Assert(TransactionIdIsValid(oldestActiveXID));
5399 :
5400 : /* Tell procarray about the range of xids it has to deal with */
971 andres 5401 71 : ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
5402 :
5403 : /*
5404 : * Startup subtrans only. CLOG, MultiXact and commit timestamp
5405 : * have already been started up and other SLRUs are not maintained
697 tgl 5406 ECB : * during recovery and need not be started yet.
5407 : */
4859 simon 5408 GIC 71 : StartupSUBTRANS(oldestActiveXID);
5409 :
5410 : /*
5411 : * If we're beginning at a shutdown checkpoint, we know that
5412 : * nothing was running on the primary at this point. So fake-up an
5413 : * empty running-xacts record and use that here and now. Recover
5414 : * additional standby state for prepared transactions.
5415 : */
4744 heikki.linnakangas 5416 CBC 71 : if (wasShutdown)
5417 : {
5418 : RunningTransactionsData running;
5419 : TransactionId latestCompletedXid;
5420 :
5421 : /*
5422 : * Construct a RunningTransactions snapshot representing a
5423 : * shut down server, with only prepared transactions still
5424 : * alive. We're never overflowed at this point because all
5425 : * subxids are listed with their parent prepared transactions.
5426 : */
4744 heikki.linnakangas 5427 GIC 12 : running.xcnt = nxids;
3780 simon 5428 12 : running.subxcnt = 0;
4744 heikki.linnakangas 5429 CBC 12 : running.subxid_overflow = false;
971 andres 5430 12 : running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
4744 heikki.linnakangas 5431 GIC 12 : running.oldestRunningXid = oldestActiveXID;
971 andres 5432 CBC 12 : latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
4714 simon 5433 GIC 12 : TransactionIdRetreat(latestCompletedXid);
4713 simon 5434 CBC 12 : Assert(TransactionIdIsNormal(latestCompletedXid));
4714 simon 5435 GIC 12 : running.latestCompletedXid = latestCompletedXid;
4744 heikki.linnakangas 5436 CBC 12 : running.xids = xids;
4744 heikki.linnakangas 5437 ECB :
4744 heikki.linnakangas 5438 GIC 12 : ProcArrayApplyRecoveryInfo(&running);
5439 :
2173 simon 5440 CBC 12 : StandbyRecoverPreparedTransactions();
5441 : }
5442 : }
4859 simon 5443 ECB :
1166 peter 5444 : /*
417 heikki.linnakangas 5445 : * We're all set for replaying the WAL now. Do it.
5446 : */
417 heikki.linnakangas 5447 CBC 151 : PerformWalRecovery();
5448 117 : performedWalRecovery = true;
5449 : }
5450 : else
413 heikki.linnakangas 5451 GIC 1025 : performedWalRecovery = false;
5452 :
5453 : /*
5454 : * Finish WAL recovery.
5455 : */
417 5456 1142 : endOfRecoveryInfo = FinishWalRecovery();
5457 1142 : EndOfLog = endOfRecoveryInfo->endOfLog;
417 heikki.linnakangas 5458 CBC 1142 : EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
417 heikki.linnakangas 5459 GIC 1142 : abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
5460 1142 : missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
5461 :
5462 : /*
5463 : * Reset ps status display, so that no information related to recovery
5464 : * shows up.
5465 : */
199 michael 5466 1142 : set_ps_display("");
5467 :
6822 tgl 5468 ECB : /*
5469 : * When recovering from a backup (we are in recovery, and archive recovery
368 sfrost 5470 : * was requested), complain if we did not roll forward far enough to reach
5471 : * the point where the database is consistent. For regular online
5472 : * backup-from-primary, that means reaching the end-of-backup WAL record
5473 : * (at which point we reset backupStartPoint to be Invalid), for
5474 : * backup-from-replica (which can't inject records into the WAL stream),
5475 : * that point is when we reach the minRecoveryPoint in pg_control (which
5476 : * we purposefully copy last when backing up from a replica). For
5477 : * pg_rewind (which creates a backup_label with a method of "pg_rewind")
5478 : * or snapshot-style backups (which don't), backupEndRequired will be set
5479 : * to false.
5480 : *
5481 : * Note: it is indeed okay to look at the local variable
5482 : * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
417 heikki.linnakangas 5483 : * might be further ahead --- ControlFile->minRecoveryPoint cannot have
5484 : * been advanced beyond the WAL we processed.
5485 : */
4393 heikki.linnakangas 5486 GIC 1142 : if (InRecovery &&
417 5487 117 : (EndOfLog < LocalMinRecoveryPoint ||
4843 5488 117 : !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
5489 : {
5490 : /*
5491 : * Ran off end of WAL before reaching end-of-backup WAL record, or
5492 : * minRecoveryPoint. That's a bad sign, indicating that you tried to
5493 : * recover from an online backup but never called pg_backup_stop(), or
5494 : * you didn't archive all the WAL needed.
5495 : */
3698 heikki.linnakangas 5496 UIC 0 : if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
4379 heikki.linnakangas 5497 ECB : {
368 sfrost 5498 UIC 0 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
4260 heikki.linnakangas 5499 LBC 0 : ereport(FATAL,
4260 heikki.linnakangas 5500 ECB : (errmsg("WAL ends before end of online backup"),
5501 : errhint("All WAL generated while online backup was taken must be available at recovery.")));
5502 : else
4379 heikki.linnakangas 5503 UIC 0 : ereport(FATAL,
2118 tgl 5504 ECB : (errmsg("WAL ends before consistent recovery point")));
4379 heikki.linnakangas 5505 : }
5506 : }
5507 :
5508 : /*
417 5509 : * Reset unlogged relations to the contents of their INIT fork. This is
5510 : * done AFTER recovery is complete so as to include any unlogged relations
5511 : * created during recovery, but BEFORE recovery is marked as having
5512 : * completed successfully. Otherwise we'd not retry if any of the post
5513 : * end-of-recovery steps fail.
5514 : */
417 heikki.linnakangas 5515 GIC 1142 : if (InRecovery)
5516 117 : ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
417 heikki.linnakangas 5517 ECB :
5518 : /*
5519 : * Pre-scan prepared transactions to find out the range of XIDs present.
5520 : * This information is not quite needed yet, but it is positioned here so
5521 : * that potential problems are detected before any on-disk change is done.
5522 : */
1735 michael 5523 CBC 1142 : oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
5524 :
5525 : /*
5526 : * Allow ordinary WAL segment creation before possibly switching to a new
5527 : * timeline, which creates a new segment, and after the last ReadRecord().
5528 : */
235 michael 5529 GNC 1142 : SetInstallXLogFileSegmentActive();
5530 :
5531 : /*
5532 : * Consider whether we need to assign a new timeline ID.
5533 : *
417 heikki.linnakangas 5534 ECB : * If we did archive recovery, we always assign a new ID. This handles a
5535 : * couple of issues. If we stopped short of the end of WAL during
5536 : * recovery, then we are clearly generating a new timeline and must assign
5537 : * it a unique new ID. Even if we ran to the end, modifying the current
5538 : * last segment is problematic because it may result in trying to
5624 bruce 5539 : * overwrite an already-archived copy of that segment, and we encourage
5671 tgl 5540 : * DBAs to make their archive_commands reject that. We can dodge the
5541 : * problem by making the new active segment have a new timeline ID.
5542 : *
5543 : * In a normal crash recovery, we can just extend the timeline we were in.
5544 : */
417 heikki.linnakangas 5545 GIC 1142 : newTLI = endOfRecoveryInfo->lastRecTLI;
3698 heikki.linnakangas 5546 CBC 1142 : if (ArchiveRecoveryRequested)
5547 : {
515 rhaas 5548 GIC 39 : newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
6836 tgl 5549 39 : ereport(LOG,
5550 : (errmsg("selected new timeline ID: %u", newTLI)));
5551 :
5552 : /*
417 heikki.linnakangas 5553 ECB : * Make a writable copy of the last WAL segment. (Note that we also
5554 : * have a copy of the last block of the old WAL in
5555 : * endOfRecoveryInfo->lastPage; we will use that below.)
5556 : */
417 heikki.linnakangas 5557 GIC 39 : XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
5558 :
5559 : /*
5560 : * Remove the signal files out of the way, so that we don't
417 heikki.linnakangas 5561 ECB : * accidentally re-enter archive recovery mode in a subsequent crash.
5562 : */
417 heikki.linnakangas 5563 GIC 39 : if (endOfRecoveryInfo->standby_signal_file_found)
5564 36 : durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
5565 :
5566 39 : if (endOfRecoveryInfo->recovery_signal_file_found)
5567 3 : durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
5568 :
5569 : /*
5570 : * Write the timeline history file, and have it archived. After this
5571 : * point (or rather, as soon as the file is archived), the timeline
1735 michael 5572 ECB : * will appear as "taken" in the WAL archive and to any standby
5573 : * servers. If we crash before actually switching to the new
5574 : * timeline, standby servers will nevertheless think that we switched
5575 : * to the new timeline, and will try to connect to the new timeline.
5576 : * To minimize the window for that, try to do as little as possible
5577 : * between here and writing the end-of-recovery record.
5578 : */
515 rhaas 5579 CBC 39 : writeTimeLineHistory(newTLI, recoveryTargetTLI,
417 heikki.linnakangas 5580 ECB : EndOfLog, endOfRecoveryInfo->recoveryStopReason);
1285 michael 5581 :
417 heikki.linnakangas 5582 GIC 39 : ereport(LOG,
417 heikki.linnakangas 5583 ECB : (errmsg("archive recovery complete")));
5584 : }
6836 tgl 5585 :
5586 : /* Save the selected TimeLineID in shared memory, too */
515 rhaas 5587 GIC 1142 : XLogCtl->InsertTimeLineID = newTLI;
417 heikki.linnakangas 5588 1142 : XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
5589 :
5590 : /*
5591 : * Actually, if WAL ended in an incomplete record, skip the parts that
557 alvherre 5592 ECB : * made it through and start writing after the portion that persisted.
5593 : * (It's critical to first write an OVERWRITE_CONTRECORD message, which
5594 : * we'll do as soon as we're open for writing new WAL.)
5595 : */
557 alvherre 5596 CBC 1142 : if (!XLogRecPtrIsInvalid(missingContrecPtr))
5597 : {
5598 : /*
5599 : * We should only have a missingContrecPtr if we're not switching to
5600 : * a new timeline. When a timeline switch occurs, WAL is copied from
223 rhaas 5601 ECB : * the old timeline to the new only up to the end of the last complete
5602 : * record, so there can't be an incomplete WAL record that we need to
5603 : * disregard.
5604 : */
223 rhaas 5605 CBC 1 : Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
557 alvherre 5606 GIC 1 : Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
5607 1 : EndOfLog = missingContrecPtr;
5608 : }
5609 :
5610 : /*
2158 peter_e 5611 ECB : * Prepare to write WAL starting at EndOfLog location, and init xlog
5612 : * buffer cache using the block containing the last record from the
5613 : * previous incarnation.
5614 : */
8198 vadim4o 5615 GIC 1142 : Insert = &XLogCtl->Insert;
417 heikki.linnakangas 5616 1142 : Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
3553 5617 1142 : Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
5618 :
5619 : /*
5620 : * Tricky point here: lastPage contains the *last* block that the LastRec
5621 : * record spans, not the one it starts in. The last block is indeed the
5622 : * one we want to use.
5623 : */
5624 1142 : if (EndOfLog % XLOG_BLCKSZ != 0)
5625 : {
5626 : char *page;
5627 : int len;
5628 : int firstIdx;
5629 :
5630 1128 : firstIdx = XLogRecPtrToBufIdx(EndOfLog);
417 heikki.linnakangas 5631 CBC 1128 : len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
5632 1128 : Assert(len < XLOG_BLCKSZ);
8198 vadim4o 5633 ECB :
5634 : /* Copy the valid part of the last block, and zero the rest */
3553 heikki.linnakangas 5635 GIC 1128 : page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
417 5636 1128 : memcpy(page, endOfRecoveryInfo->lastPage, len);
3553 5637 1128 : memset(page + len, 0, XLOG_BLCKSZ - len);
5638 :
417 5639 1128 : XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
5640 1128 : XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
7206 tgl 5641 EUB : }
5642 : else
5643 : {
5644 : /*
5645 : * There is no partial block to copy. Just set InitializedUpTo, and
5646 : * let the first attempt to insert a log record initialize the next
5647 : * buffer.
5648 : */
3553 heikki.linnakangas 5649 GIC 14 : XLogCtl->InitializedUpTo = EndOfLog;
5650 : }
5651 :
5652 1142 : LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5653 :
5654 1142 : XLogCtl->LogwrtResult = LogwrtResult;
5655 :
5656 1142 : XLogCtl->LogwrtRqst.Write = EndOfLog;
5657 1142 : XLogCtl->LogwrtRqst.Flush = EndOfLog;
5658 :
5659 : /*
8062 tgl 5660 ECB : * Preallocate additional log files, if wanted.
5661 : */
515 rhaas 5662 GIC 1142 : PreallocXlogFiles(EndOfLog, newTLI);
5663 :
5664 : /*
5665 : * Okay, we're officially UP.
5666 : */
8198 vadim4o 5667 1142 : InRecovery = false;
8586 vadim4o 5668 ECB :
5669 : /* start the archive_timeout timer and LSN running */
3553 heikki.linnakangas 5670 GIC 1142 : XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2299 andres 5671 1142 : XLogCtl->lastSegSwitchLSN = EndOfLog;
5672 :
5673 : /* also initialize latestCompletedXid, to nextXid - 1 */
4080 tgl 5674 CBC 1142 : LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
971 andres 5675 GIC 1142 : ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
5676 1142 : FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
4080 tgl 5677 1142 : LWLockRelease(ProcArrayLock);
5678 :
5679 : /*
5680 : * Start up subtrans, if not already done for hot standby. (commit
5681 : * timestamps are started below, if necessary.)
5682 : */
4859 simon 5683 1142 : if (standbyState == STANDBY_DISABLED)
5684 1103 : StartupSUBTRANS(oldestActiveXID);
5685 :
5686 : /*
5687 : * Perform end of recovery actions for any SLRUs that need it.
5688 : */
4176 5689 1142 : TrimCLOG();
3418 alvherre 5690 CBC 1142 : TrimMultiXact();
4176 simon 5691 ECB :
5692 : /* Reload shared-memory state for prepared transactions */
6505 tgl 5693 CBC 1142 : RecoverPreparedTransactions();
6505 tgl 5694 ECB :
5695 : /* Shut down xlogreader */
417 heikki.linnakangas 5696 GIC 1142 : ShutdownWalRecovery();
5697 :
5698 : /* Enable WAL writes for this backend only. */
542 rhaas 5699 1142 : LocalSetXLogInsertAllowed();
5700 :
5701 : /* If necessary, write overwrite-contrecord before doing anything else */
542 rhaas 5702 CBC 1142 : if (!XLogRecPtrIsInvalid(abortedRecPtr))
5703 : {
542 rhaas 5704 GIC 1 : Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
417 heikki.linnakangas 5705 1 : CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
5706 : }
5707 :
542 rhaas 5708 ECB : /*
5709 : * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
5710 : * record before resource manager writes cleanup WAL records or checkpoint
5711 : * record is written.
5712 : */
542 rhaas 5713 GIC 1142 : Insert->fullPageWrites = lastFullPageWrites;
5714 1142 : UpdateFullPageWrites();
5715 :
5716 : /*
5717 : * Emit checkpoint or end-of-recovery record in XLOG, if required.
5718 : */
417 heikki.linnakangas 5719 1142 : if (performedWalRecovery)
542 rhaas 5720 117 : promoted = PerformRecoveryXLogAction();
5721 :
5722 : /*
5723 : * If any of the critical GUCs have changed, log them before we allow
4729 heikki.linnakangas 5724 ECB : * backends to write WAL.
5725 : */
4729 heikki.linnakangas 5726 GIC 1142 : XLogReportParameters();
4729 heikki.linnakangas 5727 ECB :
5728 : /* If this is archive recovery, perform post-recovery cleanup actions. */
531 rhaas 5729 GIC 1142 : if (ArchiveRecoveryRequested)
515 5730 39 : CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
5731 :
3049 alvherre 5732 ECB : /*
2878 bruce 5733 : * Local WAL inserts enabled, so it's time to finish initialization of
5734 : * commit timestamp.
5735 : */
3049 alvherre 5736 GIC 1142 : CompleteCommitTsInitialization();
5737 :
5738 : /*
5739 : * All done with end-of-recovery actions.
5740 : *
2448 peter_e 5741 ECB : * Now allow backends to write WAL and update the control file status in
5742 : * consequence. SharedRecoveryState, that controls if backends can write
5743 : * WAL, is updated while holding ControlFileLock to prevent other backends
5744 : * from looking at an inconsistent state of the control file in shared memory.
5745 : * There is still a small window during which backends can write WAL and
5746 : * the control file is still referring to a system not in DB_IN_PRODUCTION
5747 : * state while looking at the on-disk control file.
5748 : *
5749 : * Also, we use info_lck to update SharedRecoveryState to ensure that
792 michael 5750 : * there are no race conditions concerning visibility of other recent
5751 : * updates to shared memory.
5163 heikki.linnakangas 5752 : */
2448 peter_e 5753 GIC 1142 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5754 1142 : ControlFile->state = DB_IN_PRODUCTION;
5755 :
3121 andres 5756 1142 : SpinLockAcquire(&XLogCtl->info_lck);
1080 michael 5757 1142 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
3121 andres 5758 1142 : SpinLockRelease(&XLogCtl->info_lck);
5759 :
2448 peter_e 5760 CBC 1142 : UpdateControlFile();
5761 1142 : LWLockRelease(ControlFileLock);
2448 peter_e 5762 ECB :
5763 : /*
5764 : * Shutdown the recovery environment. This must occur after
5765 : * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
5766 : * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so that
5767 : * any session building a snapshot will not rely on KnownAssignedXids as
5768 : * RecoveryInProgress() would return false at this stage. This is
552 michael 5769 : * particularly critical for prepared 2PC transactions, that would still
5770 : * need to be included in snapshots once recovery has ended.
5771 : */
552 michael 5772 GIC 1142 : if (standbyState != STANDBY_DISABLED)
5773 39 : ShutdownRecoveryTransactionEnvironment();
5774 :
3769 heikki.linnakangas 5775 ECB : /*
3602 bruce 5776 : * If there were cascading standby servers connected to us, nudge any wal
5777 : * sender processes to notice that we've been promoted.
5778 : */
1 andres 5779 GNC 1142 : WalSndWakeup(true, true);
3709 heikki.linnakangas 5780 ECB :
5781 : /*
697 tgl 5782 : * If this was a promotion, request an (online) checkpoint now. This isn't
5783 : * required for consistency, but the last restartpoint might be far back,
5784 : * and in case of a crash, recovering from it might take a longer than is
5785 : * appropriate now that we're not in standby mode anymore.
5786 : */
984 fujii 5787 GIC 1142 : if (promoted)
3610 simon 5788 36 : RequestCheckpoint(CHECKPOINT_FORCE);
5163 heikki.linnakangas 5789 1142 : }
5790 :
5791 : /*
5792 : * Callback from PerformWalRecovery(), called when we switch from crash
5793 : * recovery to archive recovery mode. Updates the control file accordingly.
4744 heikki.linnakangas 5794 ECB : */
5795 : void
417 heikki.linnakangas 5796 GIC 2 : SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
4744 heikki.linnakangas 5797 ECB : {
5798 : /* initialize minRecoveryPoint to this record */
417 heikki.linnakangas 5799 CBC 2 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
417 heikki.linnakangas 5800 GIC 2 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
417 heikki.linnakangas 5801 CBC 2 : if (ControlFile->minRecoveryPoint < EndRecPtr)
417 heikki.linnakangas 5802 ECB : {
417 heikki.linnakangas 5803 GIC 2 : ControlFile->minRecoveryPoint = EndRecPtr;
5804 2 : ControlFile->minRecoveryPointTLI = replayTLI;
5805 : }
5806 : /* update local copy */
417 heikki.linnakangas 5807 CBC 2 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
417 heikki.linnakangas 5808 GIC 2 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5809 :
5810 : /*
5811 : * The startup process can update its local copy of minRecoveryPoint from
417 heikki.linnakangas 5812 ECB : * this point.
5813 : */
417 heikki.linnakangas 5814 GIC 2 : updateMinRecoveryPoint = true;
1739 michael 5815 ECB :
417 heikki.linnakangas 5816 CBC 2 : UpdateControlFile();
5817 :
5818 : /*
417 heikki.linnakangas 5819 ECB : * We update SharedRecoveryState while holding the lock on ControlFileLock
5820 : * so both states are consistent in shared memory.
3763 5821 : */
417 heikki.linnakangas 5822 CBC 2 : SpinLockAcquire(&XLogCtl->info_lck);
417 heikki.linnakangas 5823 GIC 2 : XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
5824 2 : SpinLockRelease(&XLogCtl->info_lck);
5825 :
5826 2 : LWLockRelease(ControlFileLock);
5827 2 : }
3763 heikki.linnakangas 5828 ECB :
417 5829 : /*
5830 : * Callback from PerformWalRecovery(), called when we reach the end of backup.
5831 : * Updates the control file accordingly.
5832 : */
5833 : void
417 heikki.linnakangas 5834 CBC 51 : ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
417 heikki.linnakangas 5835 ECB : {
5836 : /*
5837 : * We have reached the end of base backup, as indicated by pg_control. The
5838 : * data on disk is now consistent (unless minRecovery point is further
5839 : * ahead, which can happen if we crashed during previous recovery). Reset
5840 : * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
5841 : * make sure we don't allow starting up at an earlier point even if
5842 : * recovery is stopped and restarted soon after this.
5843 : */
417 heikki.linnakangas 5844 CBC 51 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5845 :
417 heikki.linnakangas 5846 GIC 51 : if (ControlFile->minRecoveryPoint < EndRecPtr)
4744 heikki.linnakangas 5847 ECB : {
417 heikki.linnakangas 5848 GIC 49 : ControlFile->minRecoveryPoint = EndRecPtr;
417 heikki.linnakangas 5849 CBC 49 : ControlFile->minRecoveryPointTLI = tli;
417 heikki.linnakangas 5850 ECB : }
5851 :
417 heikki.linnakangas 5852 GIC 51 : ControlFile->backupStartPoint = InvalidXLogRecPtr;
5853 51 : ControlFile->backupEndPoint = InvalidXLogRecPtr;
5854 51 : ControlFile->backupEndRequired = false;
5855 51 : UpdateControlFile();
5856 :
5857 51 : LWLockRelease(ControlFileLock);
4744 heikki.linnakangas 5858 CBC 51 : }
4744 heikki.linnakangas 5859 ECB :
5860 : /*
5861 : * Perform whatever XLOG actions are necessary at end of REDO.
5862 : *
5863 : * The goal here is to make sure that we'll be able to recover properly if
543 rhaas 5864 : * we crash again. If we choose to write a checkpoint, we'll write a shutdown
5865 : * checkpoint rather than an on-line one. This is not particularly critical,
5866 : * but since we may be assigning a new TLI, using a shutdown checkpoint allows
5867 : * us to have the rule that TLI only changes in shutdown checkpoints, which
5868 : * allows some extra error checking in xlog_redo.
5869 : */
5870 : static bool
543 rhaas 5871 CBC 117 : PerformRecoveryXLogAction(void)
5872 : {
543 rhaas 5873 GIC 117 : bool promoted = false;
543 rhaas 5874 ECB :
5875 : /*
5876 : * Perform a checkpoint to update all our recovery activity to disk.
5877 : *
5878 : * Note that we write a shutdown checkpoint rather than an on-line one.
5879 : * This is not particularly critical, but since we may be assigning a new
5880 : * TLI, using a shutdown checkpoint allows us to have the rule that TLI
417 heikki.linnakangas 5881 : * only changes in shutdown checkpoints, which allows some extra error
5882 : * checking in xlog_redo.
5883 : *
5884 : * In promotion, only create a lightweight end-of-recovery record instead
5885 : * of a full checkpoint. A checkpoint is requested later, after we're
5886 : * fully out of recovery mode and already accepting queries.
5887 : */
543 rhaas 5888 GIC 156 : if (ArchiveRecoveryRequested && IsUnderPostmaster &&
417 heikki.linnakangas 5889 39 : PromoteIsTriggered())
5890 : {
543 rhaas 5891 36 : promoted = true;
5892 :
5893 : /*
5894 : * Insert a special WAL record to mark the end of recovery, since we
5895 : * aren't doing a checkpoint. That means that the checkpointer process
5896 : * may likely be in the middle of a time-smoothed restartpoint and
5897 : * could continue to be for minutes after this. That sounds strange,
417 heikki.linnakangas 5898 ECB : * but the effect is roughly the same and it would be stranger to try
5899 : * to come out of the restartpoint and then checkpoint. We request a
5900 : * checkpoint later anyway, just for safety.
543 rhaas 5901 : */
543 rhaas 5902 CBC 36 : CreateEndOfRecoveryRecord();
543 rhaas 5903 ECB : }
5904 : else
5905 : {
543 rhaas 5906 CBC 81 : RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5907 : CHECKPOINT_IMMEDIATE |
5908 : CHECKPOINT_WAIT);
5909 : }
5910 :
543 rhaas 5911 GIC 117 : return promoted;
5912 : }
5913 :
5914 : /*
5915 : * Is the system still in recovery?
5916 : *
5035 tgl 5917 ECB : * Unlike testing InRecovery, this works in any process that's connected to
5918 : * shared memory.
5919 : */
5920 : bool
5163 heikki.linnakangas 5921 GIC 81142281 : RecoveryInProgress(void)
5922 : {
5923 : /*
4790 bruce 5924 ECB : * We check shared state each time only until we leave recovery mode. We
5925 : * can't re-enter recovery, so there's no need to keep checking after the
5926 : * shared variable has once been seen false.
5927 : */
5163 heikki.linnakangas 5928 GIC 81142281 : if (!LocalRecoveryInProgress)
5929 77601222 : return false;
5930 : else
5931 : {
3425 heikki.linnakangas 5932 ECB : /*
5933 : * use volatile pointer to make sure we make a fresh read of the
5934 : * shared variable.
5935 : */
5163 heikki.linnakangas 5936 GIC 3541059 : volatile XLogCtlData *xlogctl = XLogCtl;
5937 :
1080 michael 5938 3541059 : LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
5939 :
5940 : /*
3425 heikki.linnakangas 5941 ECB : * Note: We don't need a memory barrier when we're still in recovery.
5942 : * We might exit recovery immediately after return, so the caller
5943 : * can't rely on 'true' meaning that we're still in recovery anyway.
5944 : */
5163 5945 :
5163 heikki.linnakangas 5946 CBC 3541059 : return LocalRecoveryInProgress;
5947 : }
8062 tgl 5948 ECB : }
5949 :
5950 : /*
5951 : * Returns current recovery state from shared memory.
1080 michael 5952 : *
5953 : * This returned state is kept consistent with the contents of the control
5954 : * file. See details about the possible values of RecoveryState in xlog.h.
5955 : */
5956 : RecoveryState
1080 michael 5957 GIC 25 : GetRecoveryState(void)
5958 : {
1080 michael 5959 ECB : RecoveryState retval;
5960 :
1080 michael 5961 CBC 25 : SpinLockAcquire(&XLogCtl->info_lck);
1080 michael 5962 GIC 25 : retval = XLogCtl->SharedRecoveryState;
5963 25 : SpinLockRelease(&XLogCtl->info_lck);
5964 :
5965 25 : return retval;
5966 : }
1080 michael 5967 ECB :
5035 tgl 5968 : /*
5969 : * Is this process allowed to insert new WAL records?
5970 : *
5971 : * Ordinarily this is essentially equivalent to !RecoveryInProgress().
5972 : * But we also have provisions for forcing the result "true" or "false"
5973 : * within specific processes regardless of the global state.
5974 : */
5975 : bool
5035 tgl 5976 GIC 43544110 : XLogInsertAllowed(void)
5977 : {
5978 : /*
4790 bruce 5979 ECB : * If value is "unconditionally true" or "unconditionally false", just
5980 : * return it. This provides the normal fast path once recovery is known
5981 : * done.
5982 : */
5035 tgl 5983 GIC 43544110 : if (LocalXLogInsertAllowed >= 0)
5984 43451207 : return (bool) LocalXLogInsertAllowed;
5985 :
5986 : /*
5987 : * Else, must check to see if we're still in recovery.
5988 : */
5035 tgl 5989 CBC 92903 : if (RecoveryInProgress())
5035 tgl 5990 GIC 87067 : return false;
5035 tgl 5991 ECB :
5992 : /*
4790 bruce 5993 : * On exit from recovery, reset to "unconditionally true", since there is
5994 : * no need to keep checking.
5995 : */
5035 tgl 5996 GIC 5836 : LocalXLogInsertAllowed = 1;
5035 tgl 5997 CBC 5836 : return true;
5035 tgl 5998 ECB : }
5999 :
6000 : /*
6001 : * Make XLogInsertAllowed() return true in the current process only.
4992 6002 : *
6003 : * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6004 : * and even call LocalSetXLogInsertAllowed() again after that.
6005 : *
6006 : * Returns the previous value of LocalXLogInsertAllowed.
6007 : */
6008 : static int
5035 tgl 6009 GIC 1169 : LocalSetXLogInsertAllowed(void)
6010 : {
417 heikki.linnakangas 6011 1169 : int oldXLogAllowed = LocalXLogInsertAllowed;
6012 :
5035 tgl 6013 1169 : LocalXLogInsertAllowed = 1;
6014 :
531 rhaas 6015 1169 : return oldXLogAllowed;
5035 tgl 6016 ECB : }
6017 :
7695 6018 : /*
6019 : * Return the current Redo pointer from shared memory.
6020 : *
6021 : * As a side-effect, the local RedoRecPtr copy is updated.
6022 : */
6023 : XLogRecPtr
8137 vadim4o 6024 GIC 152956 : GetRedoRecPtr(void)
6025 : {
6026 : XLogRecPtr ptr;
6027 :
6028 : /*
6029 : * The possibly not up-to-date copy in XlogCtl is enough. Even if we
6030 : * grabbed a WAL insertion lock to read the authoritative value in
6031 : * Insert->RedoRecPtr, someone might update it just after we've released
6032 : * the lock.
3562 heikki.linnakangas 6033 ECB : */
3121 andres 6034 CBC 152956 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 6035 GIC 152956 : ptr = XLogCtl->RedoRecPtr;
3121 andres 6036 CBC 152956 : SpinLockRelease(&XLogCtl->info_lck);
6037 :
3562 heikki.linnakangas 6038 GIC 152956 : if (RedoRecPtr < ptr)
6039 742 : RedoRecPtr = ptr;
6040 :
7695 tgl 6041 152956 : return RedoRecPtr;
6042 : }
6043 :
6044 : /*
6045 : * Return information needed to decide whether a modified block needs a
6046 : * full-page image to be included in the WAL record.
3076 heikki.linnakangas 6047 ECB : *
6048 : * The returned values are cached copies from backend-private memory, and
6049 : * possibly out-of-date or, indeed, uninitialized, in which case they will
6050 : * be InvalidXLogRecPtr and false, respectively. XLogInsertRecord will
482 rhaas 6051 : * re-check them against up-to-date values, while holding the WAL insert lock.
6052 : */
6053 : void
3076 heikki.linnakangas 6054 GIC 19592404 : GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
6055 : {
3076 heikki.linnakangas 6056 CBC 19592404 : *RedoRecPtr_p = RedoRecPtr;
3076 heikki.linnakangas 6057 GIC 19592404 : *doPageWrites_p = doPageWrites;
6058 19592404 : }
6059 :
6060 : /*
6061 : * GetInsertRecPtr -- Returns the current insert position.
6062 : *
6063 : * NOTE: The value *actually* returned is the position of the last full
6064 : * xlog page. It lags behind the real insert position by at most 1 page.
6065 : * For that, we don't need to scan through WAL insertion locks, and an
3562 heikki.linnakangas 6066 ECB : * approximation is enough for the current usage of this function.
6067 : */
6068 : XLogRecPtr
5764 tgl 6069 GIC 2424 : GetInsertRecPtr(void)
6070 : {
6071 : XLogRecPtr recptr;
6072 :
3121 andres 6073 CBC 2424 : SpinLockAcquire(&XLogCtl->info_lck);
6074 2424 : recptr = XLogCtl->LogwrtRqst.Write;
3121 andres 6075 GIC 2424 : SpinLockRelease(&XLogCtl->info_lck);
6076 :
5764 tgl 6077 2424 : return recptr;
6078 : }
6079 :
6080 : /*
4679 tgl 6081 ECB : * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6082 : * position known to be fsync'd to disk. This should only be used on a
515 rhaas 6083 : * system that is known not to be in recovery.
6084 : */
6085 : XLogRecPtr
520 rhaas 6086 GIC 147978 : GetFlushRecPtr(TimeLineID *insertTLI)
6087 : {
515 6088 147978 : Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6089 :
3121 andres 6090 147978 : SpinLockAcquire(&XLogCtl->info_lck);
2644 simon 6091 CBC 147978 : LogwrtResult = XLogCtl->LogwrtResult;
3121 andres 6092 GIC 147978 : SpinLockRelease(&XLogCtl->info_lck);
6093 :
6094 : /*
6095 : * If we're writing and flushing WAL, the time line can't be changing, so
6096 : * no lock is required.
6097 : */
520 rhaas 6098 147978 : if (insertTLI)
515 6099 24724 : *insertTLI = XLogCtl->InsertTimeLineID;
6100 :
2644 simon 6101 147978 : return LogwrtResult.Flush;
4832 heikki.linnakangas 6102 ECB : }
6103 :
6104 : /*
6105 : * GetWALInsertionTimeLine -- Returns the current timeline of a system that
520 rhaas 6106 : * is not in recovery.
6107 : */
6108 : TimeLineID
520 rhaas 6109 GIC 10349 : GetWALInsertionTimeLine(void)
520 rhaas 6110 ECB : {
520 rhaas 6111 GIC 10349 : Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6112 :
6113 : /* Since the value can't be changing, no lock is required. */
515 6114 10349 : return XLogCtl->InsertTimeLineID;
6115 : }
6116 :
6117 : /*
6118 : * GetLastImportantRecPtr -- Returns the LSN of the last important record
6119 : * inserted. All records not explicitly marked as unimportant are considered
6120 : * important.
2299 andres 6121 ECB : *
6122 : * The LSN is determined by computing the maximum of
6123 : * WALInsertLocks[i].lastImportantAt.
6124 : */
6125 : XLogRecPtr
2299 andres 6126 GIC 2386 : GetLastImportantRecPtr(void)
6127 : {
2299 andres 6128 CBC 2386 : XLogRecPtr res = InvalidXLogRecPtr;
2299 andres 6129 ECB : int i;
6130 :
2299 andres 6131 GIC 21474 : for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
6132 : {
6133 : XLogRecPtr last_important;
2299 andres 6134 ECB :
6135 : /*
6136 : * Need to take a lock to prevent torn reads of the LSN, which are
6137 : * possible on some of the supported platforms. WAL insert locks only
6138 : * support exclusive mode, so we have to use that.
6139 : */
2299 andres 6140 GIC 19088 : LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
2299 andres 6141 CBC 19088 : last_important = WALInsertLocks[i].l.lastImportantAt;
6142 19088 : LWLockRelease(&WALInsertLocks[i].l.lock);
6143 :
2299 andres 6144 GIC 19088 : if (res < last_important)
6145 2619 : res = last_important;
6146 : }
6147 :
6148 2386 : return res;
6149 : }
6150 :
6151 : /*
6152 : * Get the time and LSN of the last xlog segment switch
6153 : */
5530 tgl 6154 ECB : pg_time_t
2299 andres 6155 UIC 0 : GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
6079 tgl 6156 ECB : {
6157 : pg_time_t result;
6158 :
6159 : /* Need WALWriteLock, but shared lock is sufficient */
6079 tgl 6160 LBC 0 : LWLockAcquire(WALWriteLock, LW_SHARED);
3553 heikki.linnakangas 6161 UIC 0 : result = XLogCtl->lastSegSwitchTime;
2299 andres 6162 0 : *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
6079 tgl 6163 0 : LWLockRelease(WALWriteLock);
6164 :
6165 0 : return result;
6166 : }
6167 :
6168 : /*
8062 tgl 6169 ECB : * This must be called ONCE during postmaster or standalone-backend shutdown
6170 : */
6171 : void
7058 peter_e 6172 GIC 971 : ShutdownXLOG(int code, Datum arg)
6173 : {
6174 : /*
6175 : * We should have an aux process resource owner to use, and we should not
6176 : * be in a transaction that's installed some other resowner.
6177 : */
1726 tgl 6178 971 : Assert(AuxProcessResourceOwner != NULL);
1726 tgl 6179 CBC 971 : Assert(CurrentResourceOwner == NULL ||
1726 tgl 6180 ECB : CurrentResourceOwner == AuxProcessResourceOwner);
1726 tgl 6181 CBC 971 : CurrentResourceOwner = AuxProcessResourceOwner;
6182 :
3587 tgl 6183 ECB : /* Don't be chatty in standalone mode */
3587 tgl 6184 CBC 971 : ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6185 : (errmsg("shutting down")));
8586 vadim4o 6186 ECB :
6187 : /*
6188 : * Signal walsenders to move to stopping state.
6189 : */
2134 andres 6190 GIC 971 : WalSndInitStopping();
6191 :
6192 : /*
6193 : * Wait for WAL senders to be in stopping state. This prevents commands
6194 : * from writing new WAL.
6195 : */
6196 971 : WalSndWaitStopping();
6197 :
5163 heikki.linnakangas 6198 971 : if (RecoveryInProgress())
5163 heikki.linnakangas 6199 CBC 31 : CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6200 : else
5064 heikki.linnakangas 6201 ECB : {
6202 : /*
6203 : * If archiving is enabled, rotate the last XLOG file so that all the
6204 : * remaining records are archived (postmaster wakes up the archiver
6205 : * process one more time at the end of shutdown). The checkpoint
6206 : * record will go to the next XLOG file and won't be archived (yet).
6207 : */
430 rhaas 6208 GIC 940 : if (XLogArchivingActive())
2299 andres 6209 9 : RequestXLogSwitch(false);
6210 :
5163 heikki.linnakangas 6211 940 : CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6212 : }
8586 vadim4o 6213 971 : }
8586 vadim4o 6214 ECB :
6215 : /*
6216 : * Log start of a checkpoint.
6217 : */
5762 tgl 6218 : static void
5163 heikki.linnakangas 6219 CBC 541 : LogCheckpointStart(int flags, bool restartpoint)
5762 tgl 6220 ECB : {
856 peter 6221 GIC 541 : if (restartpoint)
856 peter 6222 CBC 28 : ereport(LOG,
6223 : /* translator: the placeholders show checkpoint options */
6224 : (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
6225 : (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6226 : (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6227 : (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6228 : (flags & CHECKPOINT_FORCE) ? " force" : "",
6229 : (flags & CHECKPOINT_WAIT) ? " wait" : "",
6230 : (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
856 peter 6231 ECB : (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6232 : (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
6233 : else
856 peter 6234 GIC 513 : ereport(LOG,
697 tgl 6235 ECB : /* translator: the placeholders show checkpoint options */
856 peter 6236 : (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
6237 : (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6238 : (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6239 : (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6240 : (flags & CHECKPOINT_FORCE) ? " force" : "",
6241 : (flags & CHECKPOINT_WAIT) ? " wait" : "",
6242 : (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6243 : (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6244 : (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
5762 tgl 6245 GIC 541 : }
5762 tgl 6246 ECB :
6247 : /*
6248 : * Log end of a checkpoint.
6249 : */
6250 : static void
5163 heikki.linnakangas 6251 GIC 2363 : LogCheckpointEnd(bool restartpoint)
6252 : {
6253 : long write_msecs,
880 tgl 6254 ECB : sync_msecs,
6255 : total_msecs,
6256 : longest_msecs,
6257 : average_msecs;
6258 : uint64 average_sync_time;
5762 6259 :
5762 tgl 6260 GIC 2363 : CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6261 :
880 6262 2363 : write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
6263 : CheckpointStats.ckpt_sync_t);
6264 :
6265 2363 : sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
6266 : CheckpointStats.ckpt_sync_end_t);
6267 :
6268 : /* Accumulate checkpoint timing summary data, in milliseconds. */
368 andres 6269 2363 : PendingCheckpointerStats.checkpoint_write_time += write_msecs;
6270 2363 : PendingCheckpointerStats.checkpoint_sync_time += sync_msecs;
4021 rhaas 6271 ECB :
6272 : /*
6273 : * All of the published timing statistics are accounted for. Only
6274 : * continue if a log message is to be written.
6275 : */
4021 rhaas 6276 CBC 2363 : if (!log_checkpoints)
4021 rhaas 6277 GIC 1822 : return;
6278 :
880 tgl 6279 541 : total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
6280 : CheckpointStats.ckpt_end_t);
6281 :
6282 : /*
6283 : * Timing values returned from CheckpointStats are in microseconds.
6284 : * Convert to milliseconds for consistent printing.
4499 rhaas 6285 ECB : */
880 tgl 6286 CBC 541 : longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
4499 rhaas 6287 ECB :
4499 rhaas 6288 GIC 541 : average_sync_time = 0;
4382 bruce 6289 CBC 541 : if (CheckpointStats.ckpt_sync_rels > 0)
4499 rhaas 6290 LBC 0 : average_sync_time = CheckpointStats.ckpt_agg_sync_time /
4499 rhaas 6291 UIC 0 : CheckpointStats.ckpt_sync_rels;
880 tgl 6292 GIC 541 : average_msecs = (long) ((average_sync_time + 999) / 1000);
4499 rhaas 6293 ECB :
6294 : /*
6295 : * ControlFileLock is not required to see ControlFile->checkPoint and
6296 : * ->checkPointCopy here as we are the only updator of those variables at
6297 : * this moment.
6298 : */
856 peter 6299 GIC 541 : if (restartpoint)
6300 28 : ereport(LOG,
6301 : (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
6302 : "%d WAL file(s) added, %d removed, %d recycled; "
6303 : "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6304 : "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
6305 : "distance=%d kB, estimate=%d kB; "
6306 : "lsn=%X/%X, redo lsn=%X/%X",
6307 : CheckpointStats.ckpt_bufs_written,
6308 : (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6309 : CheckpointStats.ckpt_segs_added,
6310 : CheckpointStats.ckpt_segs_removed,
856 peter 6311 EUB : CheckpointStats.ckpt_segs_recycled,
6312 : write_msecs / 1000, (int) (write_msecs % 1000),
6313 : sync_msecs / 1000, (int) (sync_msecs % 1000),
6314 : total_msecs / 1000, (int) (total_msecs % 1000),
6315 : CheckpointStats.ckpt_sync_rels,
6316 : longest_msecs / 1000, (int) (longest_msecs % 1000),
6317 : average_msecs / 1000, (int) (average_msecs % 1000),
6318 : (int) (PrevCheckPointDistance / 1024.0),
6319 : (int) (CheckPointDistanceEstimate / 1024.0),
6320 : LSN_FORMAT_ARGS(ControlFile->checkPoint),
6321 : LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6322 : else
856 peter 6323 GIC 513 : ereport(LOG,
6324 : (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
856 peter 6325 ECB : "%d WAL file(s) added, %d removed, %d recycled; "
6326 : "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
6327 : "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
6328 : "distance=%d kB, estimate=%d kB; "
6329 : "lsn=%X/%X, redo lsn=%X/%X",
6330 : CheckpointStats.ckpt_bufs_written,
6331 : (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6332 : CheckpointStats.ckpt_segs_added,
6333 : CheckpointStats.ckpt_segs_removed,
6334 : CheckpointStats.ckpt_segs_recycled,
6335 : write_msecs / 1000, (int) (write_msecs % 1000),
6336 : sync_msecs / 1000, (int) (sync_msecs % 1000),
6337 : total_msecs / 1000, (int) (total_msecs % 1000),
6338 : CheckpointStats.ckpt_sync_rels,
6339 : longest_msecs / 1000, (int) (longest_msecs % 1000),
6340 : average_msecs / 1000, (int) (average_msecs % 1000),
6341 : (int) (PrevCheckPointDistance / 1024.0),
6342 : (int) (CheckPointDistanceEstimate / 1024.0),
6343 : LSN_FORMAT_ARGS(ControlFile->checkPoint),
6344 : LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6345 : }
2967 heikki.linnakangas 6346 :
6347 : /*
6348 : * Update the estimate of distance between checkpoints.
6349 : *
6350 : * The estimate is used to calculate the number of WAL segments to keep
6351 : * preallocated, see XLOGfileslop().
6352 : */
6353 : static void
2967 heikki.linnakangas 6354 CBC 2363 : UpdateCheckPointDistanceEstimate(uint64 nbytes)
2967 heikki.linnakangas 6355 ECB : {
6356 : /*
6357 : * To estimate the number of segments consumed between checkpoints, keep a
6358 : * moving average of the amount of WAL generated in previous checkpoint
6359 : * cycles. However, if the load is bursty, with quiet periods and busy
6360 : * periods, we want to cater for the peak load. So instead of a plain
6361 : * moving average, let the average decline slowly if the previous cycle
6362 : * used less WAL than estimated, but bump it up immediately if it used
6363 : * more.
6364 : *
6365 : * When checkpoints are triggered by max_wal_size, this should converge to
6366 : * CheckpointSegments * wal_segment_size,
6367 : *
6368 : * Note: This doesn't pay any attention to what caused the checkpoint.
6369 : * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
6370 : * starting a base backup, are counted the same as those created
6371 : * automatically. The slow-decline will largely mask them out, if they are
6372 : * not frequent. If they are frequent, it seems reasonable to count them
6373 : * in as any others; if you issue a manual checkpoint every 5 minutes and
6374 : * never let a timed checkpoint happen, it makes sense to base the
6375 : * preallocation on that 5 minute interval rather than whatever
6376 : * checkpoint_timeout is set to.
6377 : */
2967 heikki.linnakangas 6378 CBC 2363 : PrevCheckPointDistance = nbytes;
2967 heikki.linnakangas 6379 GIC 2363 : if (CheckPointDistanceEstimate < nbytes)
6380 1015 : CheckPointDistanceEstimate = nbytes;
6381 : else
6382 1348 : CheckPointDistanceEstimate =
6383 1348 : (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
5762 tgl 6384 2363 : }
6385 :
6386 : /*
6387 : * Update the ps display for a process running a checkpoint. Note that
6388 : * this routine should not do any allocations so as it can be called
6389 : * from a critical section.
846 michael 6390 ECB : */
6391 : static void
846 michael 6392 GIC 4726 : update_checkpoint_display(int flags, bool restartpoint, bool reset)
6393 : {
6394 : /*
6395 : * The status is reported only for end-of-recovery and shutdown
6396 : * checkpoints or shutdown restartpoints. Updating the ps display is
6397 : * useful in those situations as it may not be possible to rely on
6398 : * pg_stat_activity to see the status of the checkpointer or the startup
6399 : * process.
6400 : */
846 michael 6401 CBC 4726 : if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
846 michael 6402 GIC 2766 : return;
6403 :
6404 1960 : if (reset)
6405 980 : set_ps_display("");
6406 : else
846 michael 6407 ECB : {
6408 : char activitymsg[128];
6409 :
846 michael 6410 GIC 2940 : snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
6411 980 : (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
6412 980 : (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
6413 : restartpoint ? "restartpoint" : "checkpoint");
6414 980 : set_ps_display(activitymsg);
6415 : }
846 michael 6416 ECB : }
6417 :
6418 :
6419 : /*
6420 : * Perform a checkpoint --- either during shutdown, or on-the-fly
7500 tgl 6421 : *
6422 : * flags is a bitwise OR of the following:
6423 : * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6424 : * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
5764 6425 : * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
5730 6426 : * ignoring checkpoint_completion_target parameter.
6427 : * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
6428 : * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6429 : * CHECKPOINT_END_OF_RECOVERY).
6430 : * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
6431 : *
5762 6432 : * Note: flags contains other bits, of interest here only for logging purposes.
5764 6433 : * In particular note that this routine is synchronous and does not pay
6434 : * attention to CHECKPOINT_WAIT.
3780 simon 6435 : *
6436 : * If !shutdown then we are writing an online checkpoint. This is a very special
6437 : * kind of operation and WAL record because the checkpoint action occurs over
6438 : * a period of time yet logically occurs at just a single LSN. The logical
6439 : * position of the WAL record (redo ptr) is the same or earlier than the
6440 : * physical position. When we replay WAL we locate the checkpoint via its
6441 : * physical position then read the redo ptr and actually start replay at the
6442 : * earlier logical position. Note that we don't write *anything* to WAL at
6443 : * the logical position, so that location could be any other kind of WAL record.
6444 : * All of this mechanism allows us to continue working while we checkpoint.
6445 : * As a result, timing of actions is critical here and be careful to note that
3780 simon 6446 EUB : * this function will likely take minutes to execute on a busy system.
8062 tgl 6447 : */
8595 vadim4o 6448 ECB : void
5764 tgl 6449 GIC 2340 : CreateCheckPoint(int flags)
6450 : {
6451 : bool shutdown;
6452 : CheckPoint checkPoint;
6453 : XLogRecPtr recptr;
6454 : XLogSegNo _logSegNo;
8397 bruce 6455 CBC 2340 : XLogCtlInsert *Insert = &XLogCtl->Insert;
8397 bruce 6456 ECB : uint32 freespace;
6457 : XLogRecPtr PriorRedoPtr;
6458 : XLogRecPtr curInsert;
6459 : XLogRecPtr last_important_lsn;
6460 : VirtualTransactionId *vxids;
6461 : int nvxids;
531 rhaas 6462 GIC 2340 : int oldXLogAllowed = 0;
6463 :
6464 : /*
6465 : * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6466 : * issued at a different time.
6467 : */
5035 tgl 6468 2340 : if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
5036 heikki.linnakangas 6469 967 : shutdown = true;
6470 : else
6471 1373 : shutdown = false;
6472 :
6473 : /* sanity check */
5035 tgl 6474 2340 : if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
5035 tgl 6475 UIC 0 : elog(ERROR, "can't create a checkpoint during recovery");
6476 :
6477 : /*
6478 : * Prepare to accumulate statistics.
5762 tgl 6479 ECB : *
6480 : * Note: because it is possible for log_checkpoints to change while a
6481 : * checkpoint proceeds, we always accumulate stats, even if
6482 : * log_checkpoints is currently off.
6483 : */
5762 tgl 6484 GIC 25740 : MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6485 2340 : CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6486 :
6487 : /*
6488 : * Let smgr prepare for checkpoint; this has to happen outside the
6489 : * critical section and before we determine the REDO pointer. Note that
6490 : * smgr must not do anything that'd have to be undone if we decide no
6491 : * checkpoint is needed.
6492 : */
389 tmunro 6493 2340 : SyncPreCheckpoint();
6494 :
6495 : /*
6496 : * Use a critical section to force system panic if we have trouble.
6497 : */
7862 tgl 6498 2340 : START_CRIT_SECTION();
6499 :
8595 vadim4o 6500 2340 : if (shutdown)
6501 : {
5163 heikki.linnakangas 6502 967 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8595 vadim4o 6503 967 : ControlFile->state = DB_SHUTDOWNING;
6504 967 : UpdateControlFile();
5163 heikki.linnakangas 6505 967 : LWLockRelease(ControlFileLock);
6506 : }
6507 :
6508 : /* Begin filling in the checkpoint WAL record */
7262 tgl 6509 28080 : MemSet(&checkPoint, 0, sizeof(checkPoint));
5530 tgl 6510 CBC 2340 : checkPoint.time = (pg_time_t) time(NULL);
6511 :
6512 : /*
6513 : * For Hot Standby, derive the oldestActiveXid before we fix the redo
6514 : * pointer. This allows us to begin accumulating changes to assemble our
6515 : * starting snapshot of locks and transactions.
6516 : */
4176 simon 6517 GIC 2340 : if (!shutdown && XLogStandbyInfoActive())
6518 1353 : checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
6519 : else
6520 987 : checkPoint.oldestActiveXid = InvalidTransactionId;
6521 :
6522 : /*
6523 : * Get location of last important record before acquiring insert locks (as
6524 : * GetLastImportantRecPtr() also locks WAL locks).
6525 : */
2299 andres 6526 2340 : last_important_lsn = GetLastImportantRecPtr();
6527 :
6528 : /*
6529 : * We must block concurrent insertions while examining insert state to
6530 : * determine the checkpoint REDO pointer.
6531 : */
3306 heikki.linnakangas 6532 2340 : WALInsertLockAcquireExclusive();
3562 6533 2340 : curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8062 tgl 6534 ECB :
6535 : /*
2299 andres 6536 : * If this isn't a shutdown or forced checkpoint, and if there has been no
6537 : * WAL activity requiring a checkpoint, skip it. The idea here is to
6538 : * avoid inserting duplicate checkpoints when the system is idle.
8062 tgl 6539 : */
5036 heikki.linnakangas 6540 CBC 2340 : if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
6541 : CHECKPOINT_FORCE)) == 0)
6542 : {
2299 andres 6543 GIC 28 : if (last_important_lsn == ControlFile->checkPoint)
6544 : {
3306 heikki.linnakangas 6545 5 : WALInsertLockRelease();
8062 tgl 6546 5 : END_CRIT_SECTION();
2299 andres 6547 5 : ereport(DEBUG1,
781 peter 6548 ECB : (errmsg_internal("checkpoint skipped because system is idle")));
8062 tgl 6549 GIC 5 : return;
6550 : }
6551 : }
6552 :
6553 : /*
6554 : * An end-of-recovery checkpoint is created before anyone is allowed to
6555 : * write WAL. To allow us to write the checkpoint record, temporarily
6556 : * enable XLogInsertAllowed.
4973 heikki.linnakangas 6557 ECB : */
4973 heikki.linnakangas 6558 CBC 2335 : if (flags & CHECKPOINT_END_OF_RECOVERY)
531 rhaas 6559 GIC 27 : oldXLogAllowed = LocalSetXLogInsertAllowed();
4973 heikki.linnakangas 6560 ECB :
515 rhaas 6561 CBC 2335 : checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
3709 heikki.linnakangas 6562 GIC 2335 : if (flags & CHECKPOINT_END_OF_RECOVERY)
6563 27 : checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
6564 : else
520 rhaas 6565 2308 : checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
3709 heikki.linnakangas 6566 ECB :
4092 simon 6567 CBC 2335 : checkPoint.fullPageWrites = Insert->fullPageWrites;
4973 heikki.linnakangas 6568 ECB :
6569 : /*
8062 tgl 6570 : * Compute new REDO record ptr = location of next XLOG record.
6571 : *
6572 : * NB: this is NOT necessarily where the checkpoint record itself will be,
6573 : * since other backends may insert more XLOG records while we're off doing
6574 : * the buffer flush work. Those XLOG records are logically after the
6575 : * checkpoint, even though physically before it. Got that?
6576 : */
3562 heikki.linnakangas 6577 GIC 2335 : freespace = INSERT_FREESPACE(curInsert);
3941 6578 2335 : if (freespace == 0)
6579 : {
2028 andres 6580 UIC 0 : if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
3562 heikki.linnakangas 6581 0 : curInsert += SizeOfXLogLongPHD;
6582 : else
6583 0 : curInsert += SizeOfXLogShortPHD;
6584 : }
3562 heikki.linnakangas 6585 GIC 2335 : checkPoint.redo = curInsert;
6586 :
6587 : /*
6588 : * Here we update the shared RedoRecPtr for future XLogInsert calls; this
6589 : * must be done while holding all the insertion locks.
6590 : *
6591 : * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6592 : * pointing past where it really needs to point. This is okay; the only
6593 : * consequence is that XLogInsert might back up whole buffers that it
6594 : * didn't really need to. We can't postpone advancing RedoRecPtr because
6595 : * XLogInserts that happen while we are dumping buffers must assume that
6596 : * their buffer changes are not included in the checkpoint.
6597 : */
3121 andres 6598 2335 : RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6599 :
6600 : /*
6601 : * Now we can release the WAL insertion locks, allowing other xacts to
6602 : * proceed while we are flushing disk buffers.
6603 : */
3306 heikki.linnakangas 6604 2335 : WALInsertLockRelease();
3562 heikki.linnakangas 6605 ECB :
6606 : /* Update the info_lck-protected copy of RedoRecPtr as well */
3121 andres 6607 GIC 2335 : SpinLockAcquire(&XLogCtl->info_lck);
6608 2335 : XLogCtl->RedoRecPtr = checkPoint.redo;
6609 2335 : SpinLockRelease(&XLogCtl->info_lck);
6610 :
5762 tgl 6611 ECB : /*
6612 : * If enabled, log checkpoint start. We postpone this until now so as not
6613 : * to log anything if we decided to skip the checkpoint.
6614 : */
5762 tgl 6615 GIC 2335 : if (log_checkpoints)
5163 heikki.linnakangas 6616 513 : LogCheckpointStart(flags, false);
6617 :
846 michael 6618 ECB : /* Update the process title */
846 michael 6619 GIC 2335 : update_checkpoint_display(flags, false, false);
6620 :
6621 : TRACE_POSTGRESQL_CHECKPOINT_START(flags);
6622 :
6623 : /*
3292 heikki.linnakangas 6624 ECB : * Get the other info we need for the checkpoint record.
2208 rhaas 6625 : *
6626 : * We don't need to save oldestClogXid in the checkpoint, it only matters
6627 : * for the short period in which clog is being truncated, and if we crash
6628 : * during that we'll redo the clog truncation and fix up oldestClogXid
6629 : * there.
3292 heikki.linnakangas 6630 : */
3292 heikki.linnakangas 6631 GBC 2335 : LWLockAcquire(XidGenLock, LW_SHARED);
971 andres 6632 GIC 2335 : checkPoint.nextXid = ShmemVariableCache->nextXid;
3292 heikki.linnakangas 6633 2335 : checkPoint.oldestXid = ShmemVariableCache->oldestXid;
6634 2335 : checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
6635 2335 : LWLockRelease(XidGenLock);
6636 :
3049 alvherre 6637 2335 : LWLockAcquire(CommitTsLock, LW_SHARED);
2659 mail 6638 2335 : checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
6639 2335 : checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
3049 alvherre 6640 CBC 2335 : LWLockRelease(CommitTsLock);
3049 alvherre 6641 ECB :
3292 heikki.linnakangas 6642 GIC 2335 : LWLockAcquire(OidGenLock, LW_SHARED);
6643 2335 : checkPoint.nextOid = ShmemVariableCache->nextOid;
6644 2335 : if (!shutdown)
6645 1368 : checkPoint.nextOid += ShmemVariableCache->oidCount;
6646 2335 : LWLockRelease(OidGenLock);
6647 :
6648 2335 : MultiXactGetCheckptMulti(shutdown,
3292 heikki.linnakangas 6649 ECB : &checkPoint.nextMulti,
6650 : &checkPoint.nextMultiOffset,
6651 : &checkPoint.oldestMulti,
6652 : &checkPoint.oldestMultiDB);
6653 :
6654 : /*
6655 : * Having constructed the checkpoint record, ensure all shmem disk buffers
6656 : * and commit-log buffers are flushed to disk.
6657 : *
6658 : * This I/O could fail for various reasons. If so, we will fail to
6659 : * complete the checkpoint, but there is no reason to force a system
6660 : * panic. Accordingly, exit critical section while doing it.
6661 : */
3292 heikki.linnakangas 6662 GIC 2335 : END_CRIT_SECTION();
6663 :
6664 : /*
3602 bruce 6665 ECB : * In some cases there are groups of actions that must all occur on one
6666 : * side or the other of a checkpoint record. Before flushing the
6667 : * checkpoint record we must explicitly wait for any backend currently
6668 : * performing those groups of actions.
6669 : *
6670 : * One example is end of transaction, so we must wait for any transactions
6671 : * that are currently in commit critical sections. If an xact inserted
6672 : * its commit record into XLOG just before the REDO point, then a crash
5850 tgl 6673 : * restart from the REDO point would not replay that record, which means
2214 rhaas 6674 : * that our flushing had better include the xact's update of pg_xact. So
6675 : * we wait till he's out of his commit critical section before proceeding.
5850 tgl 6676 : * See notes in RecordTransactionCommit().
6677 : *
6678 : * Because we've already released the insertion locks, this test is a bit
6679 : * fuzzy: it is possible that we will wait for xacts we didn't really need
6680 : * to wait for. But the delay should be short and it seems better to make
6681 : * checkpoint take a bit longer than to hold off insertions longer than
3260 bruce 6682 : * necessary. (In fact, the whole reason we have this issue is that xact.c
6683 : * does commit record XLOG insertion and clog update as two separate steps
6684 : * protected by different locks, but again that seems best on grounds of
6685 : * minimizing lock contention.)
6686 : *
6687 : * A transaction that has not yet set delayChkptFlags when we look cannot
366 rhaas 6688 : * be at risk, since it has not inserted its commit record yet; and one
6689 : * that's already cleared it is not at risk either, since it's done fixing
6690 : * clog and we will correctly flush the update below. So we cannot miss
6691 : * any xacts we need to wait for.
6692 : */
381 rhaas 6693 GIC 2335 : vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
3779 simon 6694 2335 : if (nvxids > 0)
6695 : {
5624 bruce 6696 ECB : do
6697 : {
5624 bruce 6698 GIC 9 : pg_usleep(10000L); /* wait for 10 msec */
381 rhaas 6699 CBC 9 : } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
6700 : DELAY_CHKPT_START));
5850 tgl 6701 ECB : }
3779 simon 6702 CBC 2335 : pfree(vxids);
5850 tgl 6703 ECB :
5764 tgl 6704 GIC 2335 : CheckPointGuts(checkPoint.redo, flags);
7897 tgl 6705 ECB :
381 rhaas 6706 GIC 2335 : vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
6707 2335 : if (nvxids > 0)
6708 : {
6709 : do
6710 : {
381 rhaas 6711 UIC 0 : pg_usleep(10000L); /* wait for 10 msec */
6712 0 : } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
6713 : DELAY_CHKPT_COMPLETE));
381 rhaas 6714 ECB : }
381 rhaas 6715 CBC 2335 : pfree(vxids);
6716 :
4859 simon 6717 ECB : /*
4790 bruce 6718 : * Take a snapshot of running transactions and write this to WAL. This
6719 : * allows us to reconstruct the state of running transactions during
6720 : * archive recovery, if required. Skip, if this info disabled.
4859 simon 6721 : *
6722 : * If we are shutting down, or Startup process is completing crash
6723 : * recovery we don't need to write running xact data.
6724 : */
4859 simon 6725 GIC 2335 : if (!shutdown && XLogStandbyInfoActive())
3780 tgl 6726 1348 : LogStandbySnapshot();
6727 :
7274 6728 2335 : START_CRIT_SECTION();
6729 :
6730 : /*
6731 : * Now insert the checkpoint record into XLOG.
6732 : */
3062 heikki.linnakangas 6733 CBC 2335 : XLogBeginInsert();
6734 2335 : XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
8062 tgl 6735 GIC 2335 : recptr = XLogInsert(RM_XLOG_ID,
8062 tgl 6736 EUB : shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
3062 heikki.linnakangas 6737 : XLOG_CHECKPOINT_ONLINE);
6738 :
8062 tgl 6739 GBC 2335 : XLogFlush(recptr);
6740 :
5035 tgl 6741 ECB : /*
6742 : * We mustn't write any new WAL after a shutdown checkpoint, or it will be
6743 : * overwritten at next startup. No-one should even try, this just allows
6744 : * sanity-checking. In the case of an end-of-recovery checkpoint, we want
6745 : * to just temporarily disable writing until the system has exited
6746 : * recovery.
6747 : */
5035 tgl 6748 GIC 2335 : if (shutdown)
6749 : {
6750 967 : if (flags & CHECKPOINT_END_OF_RECOVERY)
531 rhaas 6751 27 : LocalXLogInsertAllowed = oldXLogAllowed;
6752 : else
4790 bruce 6753 940 : LocalXLogInsertAllowed = 0; /* never again write WAL */
5035 tgl 6754 ECB : }
6755 :
6756 : /*
6757 : * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
6758 : * = end of actual checkpoint record.
6759 : */
3754 alvherre 6760 CBC 2335 : if (shutdown && checkPoint.redo != ProcLastRecPtr)
7202 tgl 6761 UIC 0 : ereport(PANIC,
6762 : (errmsg("concurrent write-ahead log activity while database system is shutting down")));
8595 vadim4o 6763 ECB :
8062 tgl 6764 : /*
1957 rhaas 6765 : * Remember the prior checkpoint's redo ptr for
6766 : * UpdateCheckPointDistanceEstimate()
6767 : */
2967 heikki.linnakangas 6768 GIC 2335 : PriorRedoPtr = ControlFile->checkPointCopy.redo;
6769 :
6770 : /*
8062 tgl 6771 ECB : * Update the control file.
6772 : */
7862 tgl 6773 GIC 2335 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8595 vadim4o 6774 2335 : if (shutdown)
8595 vadim4o 6775 CBC 967 : ControlFile->state = DB_SHUTDOWNED;
8062 tgl 6776 GIC 2335 : ControlFile->checkPoint = ProcLastRecPtr;
6777 2335 : ControlFile->checkPointCopy = checkPoint;
6778 : /* crash recovery should always recover to the end of WAL */
3755 alvherre 6779 2335 : ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
3778 heikki.linnakangas 6780 2335 : ControlFile->minRecoveryPointTLI = 0;
6781 :
6782 : /*
6783 : * Persist unloggedLSN value. It's reset on crash recovery, so this goes
6784 : * unused on non-shutdown checkpoints, but seems useful to store it always
6785 : * for debugging purposes.
6786 : */
3709 heikki.linnakangas 6787 CBC 2335 : SpinLockAcquire(&XLogCtl->ulsn_lck);
6788 2335 : ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
6789 2335 : SpinLockRelease(&XLogCtl->ulsn_lck);
3709 heikki.linnakangas 6790 ECB :
8595 vadim4o 6791 CBC 2335 : UpdateControlFile();
7862 tgl 6792 GIC 2335 : LWLockRelease(ControlFileLock);
8595 vadim4o 6793 ECB :
6075 tgl 6794 : /* Update shared-memory copy of checkpoint XID/epoch */
3121 andres 6795 CBC 2335 : SpinLockAcquire(&XLogCtl->info_lck);
971 6796 2335 : XLogCtl->ckptFullXid = checkPoint.nextXid;
3121 andres 6797 GIC 2335 : SpinLockRelease(&XLogCtl->info_lck);
6075 tgl 6798 ECB :
7274 6799 : /*
6385 bruce 6800 : * We are now done with critical updates; no need for system panic if we
5764 tgl 6801 : * have trouble while fooling with old log segments.
7274 6802 : */
7274 tgl 6803 GIC 2335 : END_CRIT_SECTION();
7274 tgl 6804 ECB :
6805 : /*
6806 : * Let smgr do post-checkpoint cleanup (eg, deleting old files).
6807 : */
1466 tmunro 6808 GIC 2335 : SyncPostCheckpoint();
6809 :
6810 : /*
6811 : * Update the average distance between checkpoints if the prior checkpoint
6812 : * exists.
6813 : */
2967 heikki.linnakangas 6814 2335 : if (PriorRedoPtr != InvalidXLogRecPtr)
6815 2335 : UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
6816 :
6817 : /*
1720 michael 6818 ECB : * Delete old log files, those no longer needed for last checkpoint to
6819 : * prevent the disk holding the xlog from growing full.
6820 : */
1720 michael 6821 GIC 2335 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
6822 2335 : KeepLogSeg(recptr, &_logSegNo);
2 andres 6823 GNC 2335 : if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
6824 : _logSegNo, InvalidOid,
6825 : InvalidTransactionId))
6826 : {
6827 : /*
6828 : * Some slots have been invalidated; recalculate the old-segment
6829 : * horizon, starting again from RedoRecPtr.
6830 : */
632 alvherre 6831 GIC 3 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
6832 3 : KeepLogSeg(recptr, &_logSegNo);
6833 : }
1720 michael 6834 2335 : _logSegNo--;
520 rhaas 6835 2335 : RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
6836 : checkPoint.ThisTimeLineID);
6837 :
6838 : /*
6839 : * Make more log segments if needed. (Do this after recycling old log
6840 : * segments, since that may supply some of the needed files.)
6841 : */
8062 tgl 6842 2335 : if (!shutdown)
520 rhaas 6843 1368 : PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
6844 :
6845 : /*
6846 : * Truncate pg_subtrans if possible. We can throw away all data before
6847 : * the oldest XMIN of any running transaction. No future transaction will
6848 : * attempt to reference any pg_subtrans entry older than that (see Asserts
6849 : * in subtrans.c). During recovery, though, we mustn't do this because
6850 : * StartupSUBTRANS hasn't been called yet.
6803 tgl 6851 ECB : */
5035 tgl 6852 CBC 2335 : if (!RecoveryInProgress())
970 andres 6853 GIC 2308 : TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
6854 :
6855 : /* Real work is done; log and update stats. */
4021 rhaas 6856 CBC 2335 : LogCheckpointEnd(false);
6736 tgl 6857 ECB :
6858 : /* Reset the process title */
846 michael 6859 GIC 2335 : update_checkpoint_display(flags, false, true);
846 michael 6860 ECB :
6861 : TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
5142 tgl 6862 : NBuffers,
6863 : CheckpointStats.ckpt_segs_added,
6864 : CheckpointStats.ckpt_segs_removed,
6865 : CheckpointStats.ckpt_segs_recycled);
6866 : }
6867 :
6868 : /*
3722 simon 6869 EUB : * Mark the end of recovery in WAL though without running a full checkpoint.
6870 : * We can expect that a restartpoint is likely to be in progress as we
6871 : * do this, though we are unwilling to wait for it to complete.
6872 : *
3722 simon 6873 ECB : * CreateRestartPoint() allows for the case where recovery may end before
6874 : * the restartpoint completes so there is no concern of concurrent behaviour.
6875 : */
6876 : static void
3722 simon 6877 GIC 36 : CreateEndOfRecoveryRecord(void)
6878 : {
6879 : xl_end_of_recovery xlrec;
6880 : XLogRecPtr recptr;
6881 :
6882 : /* sanity check */
3722 simon 6883 CBC 36 : if (!RecoveryInProgress())
3722 simon 6884 LBC 0 : elog(ERROR, "can only be used to end recovery");
6885 :
3033 heikki.linnakangas 6886 CBC 36 : xlrec.end_time = GetCurrentTimestamp();
6887 :
3306 heikki.linnakangas 6888 GIC 36 : WALInsertLockAcquireExclusive();
515 rhaas 6889 36 : xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
3709 heikki.linnakangas 6890 36 : xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
3306 heikki.linnakangas 6891 CBC 36 : WALInsertLockRelease();
3722 simon 6892 ECB :
3722 simon 6893 CBC 36 : START_CRIT_SECTION();
6894 :
3062 heikki.linnakangas 6895 GIC 36 : XLogBeginInsert();
6896 36 : XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
3062 heikki.linnakangas 6897 CBC 36 : recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
6898 :
3720 simon 6899 GIC 36 : XLogFlush(recptr);
6900 :
6901 : /*
6902 : * Update the control file so that crash recovery can follow the timeline
6903 : * changes to this point.
6904 : */
6905 36 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3720 simon 6906 CBC 36 : ControlFile->minRecoveryPoint = recptr;
520 rhaas 6907 GIC 36 : ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
3720 simon 6908 CBC 36 : UpdateControlFile();
6909 36 : LWLockRelease(ControlFileLock);
6910 :
3722 6911 36 : END_CRIT_SECTION();
3722 simon 6912 GIC 36 : }
6913 :
6914 : /*
6915 : * Write an OVERWRITE_CONTRECORD message.
6916 : *
6917 : * When on WAL replay we expect a continuation record at the start of a page
557 alvherre 6918 ECB : * that is not there, recovery ends and WAL writing resumes at that point.
557 alvherre 6919 EUB : * But it's wrong to resume writing new WAL back at the start of the record
6920 : * that was broken, because downstream consumers of that WAL (physical
6921 : * replicas) are not prepared to "rewind". So the first action after
6922 : * finishing replay of all valid WAL must be to write a record of this type
6923 : * at the point where the contrecord was missing; to support xlogreader
6924 : * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
6925 : * to the page header where the record occurs. xlogreader has an ad-hoc
557 alvherre 6926 ECB : * mechanism to report metadata about the broken record, which is what we
6927 : * use here.
6928 : *
6929 : * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
6930 : * skip the record it was reading, and pass back the LSN of the skipped
6931 : * record, so that its caller can verify (on "replay" of that record) that the
6932 : * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
417 heikki.linnakangas 6933 : *
6934 : * 'aborted_lsn' is the beginning position of the record that was incomplete.
6935 : * It is included in the WAL record. 'pagePtr' and 'newTLI' point to the
6936 : * beginning of the XLOG page where the record is to be inserted. They must
6937 : * match the current WAL insert position, they're passed here just so that we
6938 : * can verify that.
6939 : */
6940 : static XLogRecPtr
417 heikki.linnakangas 6941 GIC 1 : CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
6942 : TimeLineID newTLI)
6943 : {
6944 : xl_overwrite_contrecord xlrec;
557 alvherre 6945 ECB : XLogRecPtr recptr;
417 heikki.linnakangas 6946 : XLogPageHeader pagehdr;
6947 : XLogRecPtr startPos;
6948 :
6949 : /* sanity checks */
557 alvherre 6950 CBC 1 : if (!RecoveryInProgress())
557 alvherre 6951 UIC 0 : elog(ERROR, "can only be used at end of recovery");
417 heikki.linnakangas 6952 GIC 1 : if (pagePtr % XLOG_BLCKSZ != 0)
417 heikki.linnakangas 6953 LBC 0 : elog(ERROR, "invalid position for missing continuation record %X/%X",
417 heikki.linnakangas 6954 ECB : LSN_FORMAT_ARGS(pagePtr));
6955 :
6956 : /* The current WAL insert position should be right after the page header */
417 heikki.linnakangas 6957 GIC 1 : startPos = pagePtr;
6958 1 : if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
6959 1 : startPos += SizeOfXLogLongPHD;
6960 : else
417 heikki.linnakangas 6961 LBC 0 : startPos += SizeOfXLogShortPHD;
417 heikki.linnakangas 6962 GIC 1 : recptr = GetXLogInsertRecPtr();
6963 1 : if (recptr != startPos)
417 heikki.linnakangas 6964 UIC 0 : elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
6965 : LSN_FORMAT_ARGS(recptr));
557 alvherre 6966 ECB :
557 alvherre 6967 GIC 1 : START_CRIT_SECTION();
6968 :
6969 : /*
6970 : * Initialize the XLOG page header (by GetXLogBuffer), and set the
6971 : * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
417 heikki.linnakangas 6972 ECB : *
6973 : * No other backend is allowed to write WAL yet, so acquiring the WAL
6974 : * insertion lock is just pro forma.
6975 : */
417 heikki.linnakangas 6976 GIC 1 : WALInsertLockAcquire();
6977 1 : pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
6978 1 : pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
417 heikki.linnakangas 6979 CBC 1 : WALInsertLockRelease();
417 heikki.linnakangas 6980 ECB :
6981 : /*
6982 : * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
6983 : * page. We know it becomes the first record, because no other backend is
6984 : * allowed to write WAL yet.
6985 : */
557 alvherre 6986 GIC 1 : XLogBeginInsert();
417 heikki.linnakangas 6987 1 : xlrec.overwritten_lsn = aborted_lsn;
6988 1 : xlrec.overwrite_time = GetCurrentTimestamp();
557 alvherre 6989 CBC 1 : XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
6990 1 : recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
6991 :
417 heikki.linnakangas 6992 ECB : /* check that the record was inserted to the right place */
417 heikki.linnakangas 6993 CBC 1 : if (ProcLastRecPtr != startPos)
417 heikki.linnakangas 6994 UIC 0 : elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
6995 : LSN_FORMAT_ARGS(ProcLastRecPtr));
6996 :
557 alvherre 6997 GIC 1 : XLogFlush(recptr);
6998 :
6999 1 : END_CRIT_SECTION();
557 alvherre 7000 ECB :
557 alvherre 7001 CBC 1 : return recptr;
7002 : }
7003 :
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints: both CreateCheckPoint() and CreateRestartPoint()
 * call it with the redo pointer of the checkpoint being established.
 *
 * 'checkPointRedo' is the checkpoint's REDO location; here it is only handed
 * on to CheckPointTwoPhase().  'flags' is the caller's checkpoint flag word;
 * it is passed to CheckPointBuffers() and to the start trace probe.
 *
 * The write and sync phase timestamps are recorded in CheckpointStats as a
 * side effect.  The order of the calls below is deliberate; do not reorder.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/* Per-subsystem checkpoint work that precedes the main write phase */
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointReplicationOrigin();

	/* Write out all dirty data in SLRUs and the main buffer pool */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
	/* start of the write phase, for the checkpoint statistics */
	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointBuffers(flags);

	/* Perform all queued up fsyncs */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
	/* bracket the sync phase with start/end timestamps */
	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
	ProcessSyncRequests();
	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();

	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
7039 :
7040 : /*
5035 tgl 7041 ECB : * Save a checkpoint for recovery restart if appropriate
5035 tgl 7042 EUB : *
7043 : * This function is called each time a checkpoint record is read from XLOG.
5035 tgl 7044 ECB : * It must determine whether the checkpoint represents a safe restartpoint or
7045 : * not. If so, the checkpoint record is stashed in shared memory so that
7046 : * CreateRestartPoint can consult it. (Note that the latter function is
4176 simon 7047 : * executed by the checkpointer, while this one will be executed by the
7048 : * startup process.)
6089 tgl 7049 : */
7050 : static void
501 rhaas 7051 CBC 166 : RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
7052 : {
4146 heikki.linnakangas 7053 ECB : /*
3955 bruce 7054 : * Also refrain from creating a restartpoint if we have seen any
7055 : * references to non-existent pages. Restarting recovery from the
7056 : * restartpoint would not see the references, so we would lose the
7057 : * cross-check that the pages belonged to a relation that was dropped
7058 : * later.
7059 : */
4146 heikki.linnakangas 7060 GIC 166 : if (XLogHaveInvalidPages())
7061 : {
4146 heikki.linnakangas 7062 UIC 0 : elog(trace_recovery(DEBUG2),
4146 heikki.linnakangas 7063 ECB : "could not record restart point at %X/%X because there "
7064 : "are unresolved references to invalid pages",
775 peter 7065 : LSN_FORMAT_ARGS(checkPoint->redo));
4146 heikki.linnakangas 7066 LBC 0 : return;
4146 heikki.linnakangas 7067 ECB : }
7068 :
6089 tgl 7069 : /*
3955 bruce 7070 : * Copy the checkpoint record to shared memory, so that checkpointer can
7071 : * work out the next time it wants to perform a restartpoint.
7072 : */
3121 andres 7073 GIC 166 : SpinLockAcquire(&XLogCtl->info_lck);
501 rhaas 7074 166 : XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
7075 166 : XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
3121 andres 7076 166 : XLogCtl->lastCheckPoint = *checkPoint;
7077 166 : SpinLockRelease(&XLogCtl->info_lck);
7078 : }
7079 :
7080 : /*
7081 : * Establish a restartpoint if possible.
7082 : *
7083 : * This is similar to CreateCheckPoint, but is used during WAL recovery
7084 : * to establish a point from which recovery can roll forward without
7085 : * replaying the entire recovery log.
7086 : *
7087 : * Returns true if a new restartpoint was established. We can only establish
7088 : * a restartpoint if we have replayed a safe checkpoint record since last
7089 : * restartpoint.
7090 : */
7091 : bool
5163 heikki.linnakangas 7092 71 : CreateRestartPoint(int flags)
7093 : {
7094 : XLogRecPtr lastCheckPointRecPtr;
7095 : XLogRecPtr lastCheckPointEndPtr;
7096 : CheckPoint lastCheckPoint;
7097 : XLogRecPtr PriorRedoPtr;
7098 : XLogRecPtr receivePtr;
1720 michael 7099 ECB : XLogRecPtr replayPtr;
7100 : TimeLineID replayTLI;
7101 : XLogRecPtr endptr;
7102 : XLogSegNo _logSegNo;
7103 : TimestampTz xtime;
7104 :
7105 : /* Concurrent checkpoint/restartpoint cannot happen */
335 michael 7106 GIC 71 : Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
7107 :
5035 tgl 7108 ECB : /* Get a local copy of the last safe checkpoint record. */
3121 andres 7109 GBC 71 : SpinLockAcquire(&XLogCtl->info_lck);
3121 andres 7110 CBC 71 : lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
2355 rhaas 7111 GBC 71 : lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
3121 andres 7112 GIC 71 : lastCheckPoint = XLogCtl->lastCheckPoint;
7113 71 : SpinLockRelease(&XLogCtl->info_lck);
7114 :
5050 bruce 7115 ECB : /*
5163 heikki.linnakangas 7116 : * Check that we're still in recovery mode. It's ok if we exit recovery
7117 : * mode after this check, the restart point is valid anyway.
7118 : */
5163 heikki.linnakangas 7119 GBC 71 : if (!RecoveryInProgress())
5163 heikki.linnakangas 7120 ECB : {
5163 heikki.linnakangas 7121 LBC 0 : ereport(DEBUG2,
781 peter 7122 EUB : (errmsg_internal("skipping restartpoint, recovery has already ended")));
5163 heikki.linnakangas 7123 UIC 0 : return false;
7124 : }
5163 heikki.linnakangas 7125 ECB :
7126 : /*
7127 : * If the last checkpoint record we've replayed is already our last
7128 : * restartpoint, we can't perform a new restart point. We still update
7129 : * minRecoveryPoint in that case, so that if this is a shutdown restart
7130 : * point, we won't start up earlier than before. That's not strictly
7131 : * necessary, but when hot standby is enabled, it would be rather weird if
7132 : * the database opened up for read-only connections at a point-in-time
7133 : * before the last shutdown. Such time travel is still possible in case of
4660 bruce 7134 : * immediate shutdown, though.
5163 heikki.linnakangas 7135 : *
7136 : * We don't explicitly advance minRecoveryPoint when we do create a
5050 bruce 7137 : * restartpoint. It's assumed that flushing the buffers will do that as a
7138 : * side-effect.
7139 : */
5163 heikki.linnakangas 7140 GIC 71 : if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
3754 alvherre 7141 69 : lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
7142 : {
5163 heikki.linnakangas 7143 43 : ereport(DEBUG2,
781 peter 7144 ECB : (errmsg_internal("skipping restartpoint, already performed at %X/%X",
775 7145 : LSN_FORMAT_ARGS(lastCheckPoint.redo))));
5163 heikki.linnakangas 7146 :
5163 heikki.linnakangas 7147 CBC 43 : UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4693 rhaas 7148 43 : if (flags & CHECKPOINT_IS_SHUTDOWN)
7149 : {
4693 rhaas 7150 GIC 18 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4693 rhaas 7151 CBC 18 : ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
4693 rhaas 7152 GBC 18 : UpdateControlFile();
4693 rhaas 7153 GIC 18 : LWLockRelease(ControlFileLock);
7154 : }
5163 heikki.linnakangas 7155 CBC 43 : return false;
7156 : }
5163 heikki.linnakangas 7157 ECB :
7158 : /*
4660 bruce 7159 : * Update the shared RedoRecPtr so that the startup process can calculate
7160 : * the number of segments replayed since last restartpoint, and request a
7161 : * restartpoint if it exceeds CheckPointSegments.
7162 : *
7163 : * Like in CreateCheckPoint(), hold off insertions to update it, although
7164 : * during recovery this is just pro forma, because no WAL insertions are
7165 : * happening.
7166 : */
3306 heikki.linnakangas 7167 GIC 28 : WALInsertLockAcquireExclusive();
2967 7168 28 : RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
3306 heikki.linnakangas 7169 CBC 28 : WALInsertLockRelease();
7170 :
3562 heikki.linnakangas 7171 ECB : /* Also update the info_lck-protected copy */
3121 andres 7172 CBC 28 : SpinLockAcquire(&XLogCtl->info_lck);
7173 28 : XLogCtl->RedoRecPtr = lastCheckPoint.redo;
7174 28 : SpinLockRelease(&XLogCtl->info_lck);
4687 heikki.linnakangas 7175 ECB :
7176 : /*
7177 : * Prepare to accumulate statistics.
7178 : *
4449 rhaas 7179 : * Note: because it is possible for log_checkpoints to change while a
7180 : * checkpoint proceeds, we always accumulate stats, even if
7181 : * log_checkpoints is currently off.
7182 : */
4449 rhaas 7183 CBC 308 : MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7184 28 : CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
5163 heikki.linnakangas 7185 ECB :
4449 rhaas 7186 GIC 28 : if (log_checkpoints)
5163 heikki.linnakangas 7187 28 : LogCheckpointStart(flags, true);
7188 :
846 michael 7189 ECB : /* Update the process title */
846 michael 7190 CBC 28 : update_checkpoint_display(flags, true, false);
846 michael 7191 ECB :
5163 heikki.linnakangas 7192 GIC 28 : CheckPointGuts(lastCheckPoint.redo, flags);
7193 :
7194 : /*
1957 rhaas 7195 ECB : * Remember the prior checkpoint's redo ptr for
7196 : * UpdateCheckPointDistanceEstimate()
7197 : */
2967 heikki.linnakangas 7198 GIC 28 : PriorRedoPtr = ControlFile->checkPointCopy.redo;
7199 :
7200 : /*
7201 : * Update pg_control, using current time. Check that it still shows an
7202 : * older checkpoint, else do nothing; this is a quick hack to make sure
7203 : * nothing really bad happens if somehow we get here after the
7204 : * end-of-recovery checkpoint.
7205 : */
5163 7206 28 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
335 michael 7207 28 : if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
7208 : {
335 michael 7209 ECB : /*
7210 : * Update the checkpoint information. We do this even if the cluster
7211 : * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
7212 : * segments recycled below.
7213 : */
5035 tgl 7214 GIC 28 : ControlFile->checkPoint = lastCheckPointRecPtr;
7215 28 : ControlFile->checkPointCopy = lastCheckPoint;
7216 :
7217 : /*
335 michael 7218 ECB : * Ensure minRecoveryPoint is past the checkpoint record and update it
7219 : * if the control file still shows DB_IN_ARCHIVE_RECOVERY. Normally,
2355 rhaas 7220 EUB : * this will have happened already while writing out dirty buffers,
7221 : * but not necessarily - e.g. because no buffers were dirtied. We do
7222 : * this because a backup performed in recovery uses minRecoveryPoint
7223 : * to determine which WAL files must be included in the backup, and
335 michael 7224 : * the file (or files) containing the checkpoint record must be
7225 : * included, at a minimum. Note that for an ordinary restart of
7226 : * recovery there's no value in having the minimum recovery point any
7227 : * earlier than this anyway, because redo will begin just after the
7228 : * checkpoint record.
7229 : */
335 michael 7230 GIC 28 : if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
2355 rhaas 7231 ECB : {
335 michael 7232 CBC 28 : if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
335 michael 7233 ECB : {
335 michael 7234 CBC 6 : ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
7235 6 : ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
7236 :
7237 : /* update local copy */
335 michael 7238 GIC 6 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
7239 6 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7240 : }
7241 28 : if (flags & CHECKPOINT_IS_SHUTDOWN)
7242 13 : ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7243 : }
5035 tgl 7244 28 : UpdateControlFile();
7245 : }
5163 heikki.linnakangas 7246 28 : LWLockRelease(ControlFileLock);
7247 :
7248 : /*
7249 : * Update the average distance between checkpoints/restartpoints if the
1720 michael 7250 ECB : * prior checkpoint exists.
7251 : */
2967 heikki.linnakangas 7252 GIC 28 : if (PriorRedoPtr != InvalidXLogRecPtr)
7253 28 : UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
7254 :
7255 : /*
7256 : * Delete old log files, those no longer needed for last restartpoint to
7257 : * prevent the disk holding the xlog from growing full.
7258 : */
1720 michael 7259 28 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7260 :
7261 : /*
7262 : * Retreat _logSegNo using the current end of xlog replayed or received,
7263 : * whichever is later.
1720 michael 7264 ECB : */
1096 tmunro 7265 GIC 28 : receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
1720 michael 7266 28 : replayPtr = GetXLogReplayRecPtr(&replayTLI);
1720 michael 7267 CBC 28 : endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
7268 28 : KeepLogSeg(endptr, &_logSegNo);
2 andres 7269 GNC 28 : if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
7270 : _logSegNo, InvalidOid,
7271 : InvalidTransactionId))
632 alvherre 7272 ECB : {
7273 : /*
7274 : * Some slots have been invalidated; recalculate the old-segment
7275 : * horizon, starting again from RedoRecPtr.
7276 : */
632 alvherre 7277 UIC 0 : XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7278 0 : KeepLogSeg(endptr, &_logSegNo);
632 alvherre 7279 ECB : }
1720 michael 7280 GIC 28 : _logSegNo--;
3762 heikki.linnakangas 7281 EUB :
7282 : /*
1720 michael 7283 : * Try to recycle segments on a useful timeline. If we've been promoted
7284 : * since the beginning of this restartpoint, use the new timeline chosen
7285 : * at end of recovery. If we're still in recovery, use the timeline we're
7286 : * currently replaying.
7287 : *
7288 : * There is no guarantee that the WAL segments will be useful on the
7289 : * current timeline; if recovery proceeds to a new timeline right after
7290 : * this, the pre-allocated WAL segments on this timeline will not be used,
7291 : * and will go wasted until recycled on the next restartpoint. We'll live
7292 : * with that.
7293 : */
520 rhaas 7294 GIC 28 : if (!RecoveryInProgress())
515 rhaas 7295 UIC 0 : replayTLI = XLogCtl->InsertTimeLineID;
7296 :
520 rhaas 7297 GIC 28 : RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
7298 :
7299 : /*
1720 michael 7300 ECB : * Make more log segments if needed. (Do this after recycling old log
7301 : * segments, since that may supply some of the needed files.)
7302 : */
520 rhaas 7303 CBC 28 : PreallocXlogFiles(endptr, replayTLI);
7304 :
7305 : /*
7306 : * Truncate pg_subtrans if possible. We can throw away all data before
3260 bruce 7307 ECB : * the oldest XMIN of any running transaction. No future transaction will
4605 simon 7308 : * attempt to reference any pg_subtrans entry older than that (see Asserts
7309 : * in subtrans.c). When hot standby is disabled, though, we mustn't do
7310 : * this because StartupSUBTRANS hasn't been called yet.
5163 heikki.linnakangas 7311 : */
4605 simon 7312 CBC 28 : if (EnableHotStandby)
970 andres 7313 28 : TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
7314 :
618 heikki.linnakangas 7315 ECB : /* Real work is done; log and update stats. */
4021 rhaas 7316 GIC 28 : LogCheckpointEnd(true);
7317 :
7318 : /* Reset the process title */
846 michael 7319 28 : update_checkpoint_display(flags, true, true);
7320 :
4663 tgl 7321 28 : xtime = GetLatestXTime();
5163 heikki.linnakangas 7322 28 : ereport((log_checkpoints ? LOG : DEBUG2),
7323 : (errmsg("recovery restart point at %X/%X",
7324 : LSN_FORMAT_ARGS(lastCheckPoint.redo)),
7325 : xtime ? errdetail("Last completed transaction was at log time %s.",
7326 : timestamptz_to_str(xtime)) : 0));
5163 heikki.linnakangas 7327 ECB :
4770 7328 : /*
4686 itagaki.takahiro 7329 : * Finally, execute archive_cleanup_command, if any.
7330 : */
1596 peter_e 7331 GIC 28 : if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
62 michael 7332 LBC 0 : ExecuteRecoveryCommand(archiveCleanupCommand,
62 michael 7333 ECB : "archive_cleanup_command",
7334 : false,
7335 : WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
7336 :
5163 heikki.linnakangas 7337 GIC 28 : return true;
7338 : }
7339 :
7340 : /*
7341 : * Report availability of WAL for the given target LSN
7342 : * (typically a slot's restart_lsn)
1097 alvherre 7343 ECB : *
7344 : * Returns one of the following enum values:
7345 : *
1019 7346 : * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
7347 : * max_wal_size.
7348 : *
7349 : * * WALAVAIL_EXTENDED means it is still available by preserving extra
1097 7350 : * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
7351 : * than max_wal_size, this state is not returned.
7352 : *
7353 : * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
7354 : * remove reserved segments. The walsender using this slot may return to the
7355 : * above.
7356 : *
7357 : * * WALAVAIL_REMOVED means it has been removed. A replication stream on
7358 : * a slot with this LSN cannot continue. (Any associated walsender
7359 : * processes should have been terminated already.)
7360 : *
7361 : * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
7362 : */
7363 : WALAvailability
1097 alvherre 7364 GIC 305 : GetWALAvailability(XLogRecPtr targetLSN)
7365 : {
7366 : XLogRecPtr currpos; /* current write LSN */
1097 alvherre 7367 ECB : XLogSegNo currSeg; /* segid of currpos */
7368 : XLogSegNo targetSeg; /* segid of targetLSN */
7369 : XLogSegNo oldestSeg; /* actual oldest segid */
7370 : XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */
7371 : XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */
7372 : uint64 keepSegs;
7373 :
7374 : /*
1019 7375 : * slot does not reserve WAL. Either deactivated, or has never been active
7376 : */
1097 alvherre 7377 GIC 305 : if (XLogRecPtrIsInvalid(targetLSN))
7378 11 : return WALAVAIL_INVALID_LSN;
7379 :
7380 : /*
7381 : * Calculate the oldest segment currently reserved by all slots,
7382 : * considering wal_keep_size and max_slot_wal_keep_size. Initialize
7383 : * oldestSlotSeg to the current segment.
7384 : */
1000 7385 294 : currpos = GetXLogWriteRecPtr();
7386 294 : XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
1097 7387 294 : KeepLogSeg(currpos, &oldestSlotSeg);
7388 :
7389 : /*
7390 : * Find the oldest extant segment file. We get 1 until checkpoint removes
1097 alvherre 7391 ECB : * the first WAL segment file since startup, which causes the status being
7392 : * wrong under certain abnormal conditions but that doesn't actually harm.
7393 : */
1097 alvherre 7394 GIC 294 : oldestSeg = XLogGetLastRemovedSegno() + 1;
1097 alvherre 7395 ECB :
1019 7396 : /* calculate oldest segment by max_wal_size */
1097 alvherre 7397 GIC 294 : XLByteToSeg(currpos, currSeg, wal_segment_size);
1019 7398 294 : keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
1097 alvherre 7399 ECB :
1097 alvherre 7400 CBC 294 : if (currSeg > keepSegs)
1097 alvherre 7401 GIC 8 : oldestSegMaxWalSize = currSeg - keepSegs;
1097 alvherre 7402 ECB : else
1097 alvherre 7403 CBC 286 : oldestSegMaxWalSize = 1;
7404 :
1000 alvherre 7405 ECB : /* the segment we care about */
1000 alvherre 7406 GIC 294 : XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
1000 alvherre 7407 ECB :
7408 : /*
7409 : * No point in returning reserved or extended status values if the
7410 : * targetSeg is known to be lost.
7411 : */
1019 alvherre 7412 GIC 294 : if (targetSeg >= oldestSlotSeg)
1097 alvherre 7413 ECB : {
1019 7414 : /* show "reserved" when targetSeg is within max_wal_size */
1019 alvherre 7415 GIC 293 : if (targetSeg >= oldestSegMaxWalSize)
1097 7416 291 : return WALAVAIL_RESERVED;
7417 :
7418 : /* being retained by slots exceeding max_wal_size */
1019 7419 2 : return WALAVAIL_EXTENDED;
1097 alvherre 7420 ECB : }
7421 :
7422 : /* WAL segments are no longer retained but haven't been removed yet */
1019 alvherre 7423 GIC 1 : if (targetSeg >= oldestSeg)
7424 1 : return WALAVAIL_UNRESERVED;
7425 :
1097 alvherre 7426 ECB : /* Definitely lost */
1097 alvherre 7427 LBC 0 : return WALAVAIL_REMOVED;
1097 alvherre 7428 ECB : }
7429 :
7430 :
7431 : /*
7432 : * Retreat *logSegNo to the last segment that we need to retain because of
7433 : * either wal_keep_size or replication slots.
7434 : *
7435 : * This is calculated by subtracting wal_keep_size from the given xlog
7436 : * location, recptr and by making sure that that result is below the
7437 : * requirement of replication slots. For the latter criterion we do consider
1097 alvherre 7438 EUB : * the effects of max_slot_wal_keep_size: reserve at most that much space back
7439 : * from recptr.
7440 : *
632 alvherre 7441 ECB : * Note about replication slots: if this function calculates a value
7442 : * that's further ahead than what slots need reserved, then affected
7443 : * slots need to be invalidated and this function invoked again.
7444 : * XXX it might be a good idea to rewrite this function so that
7445 : * invalidation is optionally done here, instead.
7446 : */
7447 : static void
3941 heikki.linnakangas 7448 GIC 2660 : KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
7449 : {
7450 : XLogSegNo currSegNo;
7451 : XLogSegNo segno;
7452 : XLogRecPtr keep;
7453 :
1097 alvherre 7454 2660 : XLByteToSeg(recptr, currSegNo, wal_segment_size);
1097 alvherre 7455 CBC 2660 : segno = currSegNo;
4282 simon 7456 EUB :
7457 : /*
1097 alvherre 7458 ECB : * Calculate how many segments are kept by slots first, adjusting for
7459 : * max_slot_wal_keep_size.
7460 : */
1097 alvherre 7461 GIC 2660 : keep = XLogGetReplicationSlotMinimumLSN();
7462 2660 : if (keep != InvalidXLogRecPtr)
7463 : {
1097 alvherre 7464 CBC 384 : XLByteToSeg(keep, segno, wal_segment_size);
7465 :
7466 : /* Cap by max_slot_wal_keep_size ... */
1097 alvherre 7467 GIC 384 : if (max_slot_wal_keep_size_mb >= 0)
7468 : {
7469 : uint64 slot_keep_segs;
7470 :
7471 17 : slot_keep_segs =
7472 17 : ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
3355 rhaas 7473 ECB :
1097 alvherre 7474 CBC 17 : if (currSegNo - segno > slot_keep_segs)
1097 alvherre 7475 GIC 4 : segno = currSegNo - slot_keep_segs;
7476 : }
1097 alvherre 7477 ECB : }
7478 :
7479 : /* but, keep at least wal_keep_size if that's set */
993 fujii 7480 CBC 2660 : if (wal_keep_size_mb > 0)
7481 : {
993 fujii 7482 ECB : uint64 keep_segs;
7483 :
993 fujii 7484 GIC 59 : keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
7485 59 : if (currSegNo - segno < keep_segs)
7486 : {
7487 : /* avoid underflow, don't go below 1 */
7488 59 : if (currSegNo <= keep_segs)
7489 57 : segno = 1;
7490 : else
7491 2 : segno = currSegNo - keep_segs;
993 fujii 7492 ECB : }
3355 rhaas 7493 EUB : }
7494 :
7495 : /* don't delete WAL segments newer than the calculated segment */
1000 alvherre 7496 GIC 2660 : if (segno < *logSegNo)
3941 heikki.linnakangas 7497 109 : *logSegNo = segno;
4282 simon 7498 CBC 2660 : }
7499 :
7500 : /*
7501 : * Write a NEXTOID log record
7502 : */
7503 : void
8192 vadim4o 7504 GIC 1248 : XLogPutNextOid(Oid nextOid)
7505 : {
3062 heikki.linnakangas 7506 1248 : XLogBeginInsert();
7507 1248 : XLogRegisterData((char *) (&nextOid), sizeof(Oid));
7508 1248 : (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
7509 :
7510 : /*
7511 : * We need not flush the NEXTOID record immediately, because any of the
7512 : * just-allocated OIDs could only reach disk as part of a tuple insert or
7513 : * update that would have its own XLOG record that must follow the NEXTOID
7514 : * record. Therefore, the standard buffer LSN interlock applied to those
7515 : * records will ensure no such OID reaches disk before the NEXTOID record
7516 : * does.
7517 : *
7518 : * Note, however, that the above statement only covers state "within" the
7519 : * database. When we use a generated OID as a file or directory name, we
7520 : * are in a sense violating the basic WAL rule, because that filesystem
7521 : * change may reach disk before the NEXTOID WAL record does. The impact
7522 : * of this is that if a database crash occurs immediately afterward, we
7523 : * might after restart re-generate the same OID and find that it conflicts
7524 : * with the leftover file or directory. But since for safety's sake we
5624 bruce 7525 ECB : * always loop until finding a nonconflicting filename, this poses no real
7526 : * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7527 : */
6555 tgl 7528 GIC 1248 : }
7529 :
7530 : /*
7531 : * Write an XLOG SWITCH record.
7532 : *
7533 : * Here we just blindly issue an XLogInsert request for the record.
7534 : * All the magic happens inside XLogInsert.
7535 : *
7536 : * The return value is either the end+1 address of the switch record,
7537 : * or the end+1 address of the prior segment if we did not need to
6090 tgl 7538 ECB : * write a switch record because we are already at segment start.
7539 : */
7540 : XLogRecPtr
2299 andres 7541 GIC 300 : RequestXLogSwitch(bool mark_unimportant)
7542 : {
7543 : XLogRecPtr RecPtr;
7544 :
7545 : /* XLOG SWITCH has no data */
3062 heikki.linnakangas 7546 CBC 300 : XLogBeginInsert();
2299 andres 7547 ECB :
2299 andres 7548 CBC 300 : if (mark_unimportant)
2299 andres 7549 UIC 0 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
3062 heikki.linnakangas 7550 GIC 300 : RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
7551 :
6090 tgl 7552 300 : return RecPtr;
7553 : }
7554 :
4443 simon 7555 ECB : /*
7556 : * Write a RESTORE POINT record
7557 : */
7558 : XLogRecPtr
4443 simon 7559 CBC 3 : XLogRestorePoint(const char *rpName)
7560 : {
4382 bruce 7561 ECB : XLogRecPtr RecPtr;
7562 : xl_restore_point xlrec;
7563 :
4443 simon 7564 CBC 3 : xlrec.rp_time = GetCurrentTimestamp();
3338 tgl 7565 GIC 3 : strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
7566 :
3062 heikki.linnakangas 7567 CBC 3 : XLogBeginInsert();
3062 heikki.linnakangas 7568 GIC 3 : XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
7569 :
7570 3 : RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
7571 :
4427 rhaas 7572 3 : ereport(LOG,
4427 rhaas 7573 ECB : (errmsg("restore point \"%s\" created at %X/%X",
7574 : rpName, LSN_FORMAT_ARGS(RecPtr))));
7575 :
4443 simon 7576 CBC 3 : return RecPtr;
4443 simon 7577 ECB : }
7578 :
7579 : /*
4729 heikki.linnakangas 7580 : * Check if any of the GUC parameters that are critical for hot standby
7581 : * have changed, and update the value in pg_control file if necessary.
7582 : */
7583 : static void
4729 heikki.linnakangas 7584 CBC 1142 : XLogReportParameters(void)
4827 heikki.linnakangas 7585 ECB : {
4729 heikki.linnakangas 7586 GIC 1142 : if (wal_level != ControlFile->wal_level ||
3385 rhaas 7587 982 : wal_log_hints != ControlFile->wal_log_hints ||
4729 heikki.linnakangas 7588 GBC 923 : MaxConnections != ControlFile->MaxConnections ||
3566 rhaas 7589 GIC 922 : max_worker_processes != ControlFile->max_worker_processes ||
1517 michael 7590 922 : max_wal_senders != ControlFile->max_wal_senders ||
4729 heikki.linnakangas 7591 918 : max_prepared_xacts != ControlFile->max_prepared_xacts ||
3049 alvherre 7592 838 : max_locks_per_xact != ControlFile->max_locks_per_xact ||
7593 838 : track_commit_timestamp != ControlFile->track_commit_timestamp)
7594 : {
7595 : /*
7596 : * The change in number of backend slots doesn't need to be WAL-logged
7597 : * if archiving is not enabled, as you can't start archive recovery
7598 : * with wal_level=minimal anyway. We don't really care about the
7599 : * values in pg_control either if wal_level=minimal, but seems better
7600 : * to keep them up-to-date to avoid confusion.
7601 : */
4729 heikki.linnakangas 7602 311 : if (wal_level != ControlFile->wal_level || XLogIsNeeded())
7603 : {
7604 : xl_parameter_change xlrec;
7605 : XLogRecPtr recptr;
7606 :
7607 309 : xlrec.MaxConnections = MaxConnections;
3566 rhaas 7608 309 : xlrec.max_worker_processes = max_worker_processes;
1517 michael 7609 CBC 309 : xlrec.max_wal_senders = max_wal_senders;
4729 heikki.linnakangas 7610 GIC 309 : xlrec.max_prepared_xacts = max_prepared_xacts;
7611 309 : xlrec.max_locks_per_xact = max_locks_per_xact;
7612 309 : xlrec.wal_level = wal_level;
3385 rhaas 7613 309 : xlrec.wal_log_hints = wal_log_hints;
3049 alvherre 7614 309 : xlrec.track_commit_timestamp = track_commit_timestamp;
4729 heikki.linnakangas 7615 ECB :
3062 heikki.linnakangas 7616 CBC 309 : XLogBeginInsert();
3062 heikki.linnakangas 7617 GIC 309 : XLogRegisterData((char *) &xlrec, sizeof(xlrec));
7618 :
7619 309 : recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
3301 fujii 7620 309 : XLogFlush(recptr);
7621 : }
4827 heikki.linnakangas 7622 ECB :
1035 tmunro 7623 CBC 311 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7624 :
4729 heikki.linnakangas 7625 311 : ControlFile->MaxConnections = MaxConnections;
3566 rhaas 7626 GIC 311 : ControlFile->max_worker_processes = max_worker_processes;
1517 michael 7627 311 : ControlFile->max_wal_senders = max_wal_senders;
4729 heikki.linnakangas 7628 CBC 311 : ControlFile->max_prepared_xacts = max_prepared_xacts;
4729 heikki.linnakangas 7629 GIC 311 : ControlFile->max_locks_per_xact = max_locks_per_xact;
7630 311 : ControlFile->wal_level = wal_level;
3385 rhaas 7631 311 : ControlFile->wal_log_hints = wal_log_hints;
3049 alvherre 7632 CBC 311 : ControlFile->track_commit_timestamp = track_commit_timestamp;
4729 heikki.linnakangas 7633 311 : UpdateControlFile();
7634 :
1035 tmunro 7635 311 : LWLockRelease(ControlFileLock);
4729 heikki.linnakangas 7636 ECB : }
4827 heikki.linnakangas 7637 GIC 1142 : }
7638 :
7639 : /*
7640 : * Update full_page_writes in shared memory, and write an
4092 simon 7641 ECB : * XLOG_FPW_CHANGE record if necessary.
7642 : *
7643 : * Note: this function assumes there is no other process running
7644 : * concurrently that could update it.
7645 : */
7646 : void
4092 simon 7647 GIC 1533 : UpdateFullPageWrites(void)
7648 : {
4092 simon 7649 CBC 1533 : XLogCtlInsert *Insert = &XLogCtl->Insert;
1654 akapila 7650 ECB : bool recoveryInProgress;
7651 :
4092 simon 7652 : /*
7653 : * Do nothing if full_page_writes has not been changed.
7654 : *
7655 : * It's safe to check the shared full_page_writes without the lock,
7656 : * because we assume that there is no concurrently running process which
3955 bruce 7657 : * can update it.
4092 simon 7658 : */
4092 simon 7659 CBC 1533 : if (fullPageWrites == Insert->fullPageWrites)
4092 simon 7660 GIC 1264 : return;
7661 :
7662 : /*
7663 : * Perform this outside critical section so that the WAL insert
7664 : * initialization done by RecoveryInProgress() doesn't trigger an
1654 akapila 7665 ECB : * assertion failure.
7666 : */
1654 akapila 7667 CBC 269 : recoveryInProgress = RecoveryInProgress();
1654 akapila 7668 ECB :
4051 heikki.linnakangas 7669 CBC 269 : START_CRIT_SECTION();
7670 :
7671 : /*
7672 : * It's always safe to take full page images, even when not strictly
7673 : * required, but not the other round. So if we're setting full_page_writes
7674 : * to true, first set it true and then write the WAL record. If we're
7675 : * setting it to false, first write the WAL record and then set the global
7676 : * flag.
7677 : */
4051 heikki.linnakangas 7678 GIC 269 : if (fullPageWrites)
7679 : {
3306 7680 267 : WALInsertLockAcquireExclusive();
4051 7681 267 : Insert->fullPageWrites = true;
3306 7682 267 : WALInsertLockRelease();
7683 : }
7684 :
7685 : /*
7686 : * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
7687 : * full_page_writes during archive recovery, if required.
7688 : */
1654 akapila 7689 CBC 269 : if (XLogStandbyInfoActive() && !recoveryInProgress)
7690 : {
3062 heikki.linnakangas 7691 UIC 0 : XLogBeginInsert();
7692 0 : XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
7693 :
7694 0 : XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
7695 : }
7696 :
4051 heikki.linnakangas 7697 GIC 269 : if (!fullPageWrites)
7698 : {
3306 7699 2 : WALInsertLockAcquireExclusive();
4051 7700 2 : Insert->fullPageWrites = false;
3306 7701 2 : WALInsertLockRelease();
4092 simon 7702 ECB : }
4051 heikki.linnakangas 7703 GIC 269 : END_CRIT_SECTION();
7704 : }
7705 :
7706 : /*
8062 tgl 7707 ECB : * XLOG resource manager's routines
7708 : *
5163 heikki.linnakangas 7709 : * Definitions of info values are in include/catalog/pg_control.h, though
5035 tgl 7710 EUB : * not all record types are related to control file updates.
417 heikki.linnakangas 7711 ECB : *
7712 : * NOTE: Some XLOG record types that are directly related to WAL recovery
7713 : * are handled in xlogrecovery_redo().
7714 : */
7715 : void
3062 heikki.linnakangas 7716 GIC 28161 : xlog_redo(XLogReaderState *record)
7717 : {
7718 28161 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
7719 28161 : XLogRecPtr lsn = record->EndRecPtr;
8192 vadim4o 7720 ECB :
7721 : /*
7722 : * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
7723 : * XLOG_FPI_FOR_HINT records.
7724 : */
3058 heikki.linnakangas 7725 CBC 28161 : Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
3058 heikki.linnakangas 7726 ECB : !XLogRecHasAnyBlockRefs(record));
7727 :
8057 tgl 7728 CBC 28161 : if (info == XLOG_NEXTOID)
8192 vadim4o 7729 ECB : {
7730 : Oid nextOid;
7731 :
7732 : /*
4080 tgl 7733 : * We used to try to take the maximum of ShmemVariableCache->nextOid
7734 : * and the recorded nextOid, but that fails if the OID counter wraps
7735 : * around. Since no OID allocation should be happening during replay
7736 : * anyway, better to just believe the record exactly. We still take
7737 : * OidGenLock while setting the variable, just in case.
7738 : */
8192 vadim4o 7739 GIC 72 : memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
4080 tgl 7740 72 : LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
7741 72 : ShmemVariableCache->nextOid = nextOid;
7742 72 : ShmemVariableCache->oidCount = 0;
7743 72 : LWLockRelease(OidGenLock);
7744 : }
8062 tgl 7745 CBC 28089 : else if (info == XLOG_CHECKPOINT_SHUTDOWN)
7746 : {
8062 tgl 7747 ECB : CheckPoint checkPoint;
417 heikki.linnakangas 7748 : TimeLineID replayTLI;
8062 tgl 7749 :
8062 tgl 7750 CBC 25 : memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8062 tgl 7751 ECB : /* In a SHUTDOWN checkpoint, believe the counters exactly */
4080 tgl 7752 CBC 25 : LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
971 andres 7753 25 : ShmemVariableCache->nextXid = checkPoint.nextXid;
4080 tgl 7754 25 : LWLockRelease(XidGenLock);
4080 tgl 7755 GIC 25 : LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8062 7756 25 : ShmemVariableCache->nextOid = checkPoint.nextOid;
7757 25 : ShmemVariableCache->oidCount = 0;
4080 7758 25 : LWLockRelease(OidGenLock);
6514 7759 25 : MultiXactSetNextMXact(checkPoint.nextMulti,
7760 : checkPoint.nextMultiOffset);
7761 :
2752 andres 7762 25 : MultiXactAdvanceOldest(checkPoint.oldestMulti,
2752 andres 7763 ECB : checkPoint.oldestMultiDB);
7764 :
7765 : /*
7766 : * No need to set oldestClogXid here as well; it'll be set when we
7767 : * redo an xl_clog_truncate if it changed since initialization.
2208 rhaas 7768 : */
4799 tgl 7769 CBC 25 : SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6797 bruce 7770 ECB :
4730 heikki.linnakangas 7771 : /*
4660 bruce 7772 : * If we see a shutdown checkpoint while waiting for an end-of-backup
4302 peter_e 7773 : * record, the backup was canceled and the end-of-backup record will
4660 bruce 7774 : * never arrive.
4730 heikki.linnakangas 7775 : */
3698 heikki.linnakangas 7776 GIC 25 : if (ArchiveRecoveryRequested &&
4092 simon 7777 CBC 25 : !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
4092 simon 7778 LBC 0 : XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
4080 tgl 7779 UIC 0 : ereport(PANIC,
2118 tgl 7780 ECB : (errmsg("online backup was canceled, recovery cannot continue")));
4730 heikki.linnakangas 7781 :
7782 : /*
7783 : * If we see a shutdown checkpoint, we know that nothing was running
1029 andres 7784 : * on the primary at this point. So fake-up an empty running-xacts
7785 : * record and use that here and now. Recover additional standby state
4660 bruce 7786 : * for prepared transactions.
4744 heikki.linnakangas 7787 : */
4859 simon 7788 CBC 25 : if (standbyState >= STANDBY_INITIALIZED)
4859 simon 7789 ECB : {
4744 heikki.linnakangas 7790 : TransactionId *xids;
7791 : int nxids;
7792 : TransactionId oldestActiveXID;
4714 simon 7793 : TransactionId latestCompletedXid;
4744 heikki.linnakangas 7794 : RunningTransactionsData running;
7795 :
4744 heikki.linnakangas 7796 CBC 23 : oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7797 :
4859 simon 7798 ECB : /*
7799 : * Construct a RunningTransactions snapshot representing a shut
7800 : * down server, with only prepared transactions still alive. We're
7801 : * never overflowed at this point because all subxids are listed
7802 : * with their parent prepared transactions.
7803 : */
4744 heikki.linnakangas 7804 GIC 23 : running.xcnt = nxids;
3780 simon 7805 23 : running.subxcnt = 0;
4744 heikki.linnakangas 7806 23 : running.subxid_overflow = false;
971 andres 7807 23 : running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
4744 heikki.linnakangas 7808 CBC 23 : running.oldestRunningXid = oldestActiveXID;
971 andres 7809 GIC 23 : latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
4714 simon 7810 CBC 23 : TransactionIdRetreat(latestCompletedXid);
4713 simon 7811 GIC 23 : Assert(TransactionIdIsNormal(latestCompletedXid));
4714 7812 23 : running.latestCompletedXid = latestCompletedXid;
4744 heikki.linnakangas 7813 23 : running.xids = xids;
7814 :
7815 23 : ProcArrayApplyRecoveryInfo(&running);
7816 :
2173 simon 7817 23 : StandbyRecoverPreparedTransactions();
7818 : }
7819 :
6075 tgl 7820 ECB : /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
1035 tmunro 7821 CBC 25 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
971 andres 7822 GIC 25 : ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
1035 tmunro 7823 25 : LWLockRelease(ControlFileLock);
7824 :
7825 : /* Update shared-memory copy of checkpoint XID/epoch */
3121 andres 7826 25 : SpinLockAcquire(&XLogCtl->info_lck);
971 7827 25 : XLogCtl->ckptFullXid = checkPoint.nextXid;
3121 andres 7828 CBC 25 : SpinLockRelease(&XLogCtl->info_lck);
7829 :
6836 tgl 7830 ECB : /*
7831 : * We should've already switched to the new TLI before replaying this
7832 : * record.
7833 : */
417 heikki.linnakangas 7834 GIC 25 : (void) GetCurrentReplayRecPtr(&replayTLI);
520 rhaas 7835 25 : if (checkPoint.ThisTimeLineID != replayTLI)
3775 heikki.linnakangas 7836 UIC 0 : ereport(PANIC,
7837 : (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
7838 : checkPoint.ThisTimeLineID, replayTLI)));
6089 tgl 7839 ECB :
501 rhaas 7840 GIC 25 : RecoveryRestartPoint(&checkPoint, record);
8062 tgl 7841 ECB : }
8062 tgl 7842 CBC 28064 : else if (info == XLOG_CHECKPOINT_ONLINE)
8062 tgl 7843 ECB : {
7844 : CheckPoint checkPoint;
7845 : TimeLineID replayTLI;
7846 :
8062 tgl 7847 GIC 141 : memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7848 : /* In an ONLINE checkpoint, treat the XID counter as a minimum */
4080 7849 141 : LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
971 andres 7850 CBC 141 : if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
7851 : checkPoint.nextXid))
971 andres 7852 UBC 0 : ShmemVariableCache->nextXid = checkPoint.nextXid;
4080 tgl 7853 GBC 141 : LWLockRelease(XidGenLock);
7854 :
1824 tgl 7855 EUB : /*
7856 : * We ignore the nextOid counter in an ONLINE checkpoint, preferring
7857 : * to track OID assignment through XLOG_NEXTOID records. The nextOid
1824 tgl 7858 ECB : * counter is from the start of the checkpoint and might well be stale
7859 : * compared to later XLOG_NEXTOID records. We could try to take the
7860 : * maximum of the nextOid counter and our latest value, but since
7861 : * there's no particular guarantee about the speed with which the OID
7862 : * counter wraps around, that's a risky thing to do. In any case,
7863 : * users of the nextOid counter are required to avoid assignment of
7864 : * duplicates, so that a somewhat out-of-date value should be safe.
7865 : */
7866 :
7867 : /* Handle multixact */
6514 tgl 7868 GIC 141 : MultiXactAdvanceNextMXact(checkPoint.nextMulti,
7869 : checkPoint.nextMultiOffset);
7870 :
7871 : /*
7872 : * NB: This may perform multixact truncation when replaying WAL
7873 : * generated by an older primary.
7874 : */
2752 andres 7875 141 : MultiXactAdvanceOldest(checkPoint.oldestMulti,
7876 : checkPoint.oldestMultiDB);
4969 tgl 7877 CBC 141 : if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
7878 : checkPoint.oldestXid))
4799 tgl 7879 LBC 0 : SetTransactionIdLimit(checkPoint.oldestXid,
4799 tgl 7880 ECB : checkPoint.oldestXidDB);
7881 : /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
1035 tmunro 7882 GIC 141 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
971 andres 7883 141 : ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
1035 tmunro 7884 141 : LWLockRelease(ControlFileLock);
7885 :
4028 simon 7886 ECB : /* Update shared-memory copy of checkpoint XID/epoch */
3121 andres 7887 GIC 141 : SpinLockAcquire(&XLogCtl->info_lck);
971 7888 141 : XLogCtl->ckptFullXid = checkPoint.nextXid;
3121 andres 7889 CBC 141 : SpinLockRelease(&XLogCtl->info_lck);
7890 :
7891 : /* TLI should not change in an on-line checkpoint */
417 heikki.linnakangas 7892 GIC 141 : (void) GetCurrentReplayRecPtr(&replayTLI);
520 rhaas 7893 141 : if (checkPoint.ThisTimeLineID != replayTLI)
6997 tgl 7894 UIC 0 : ereport(PANIC,
7895 : (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
7896 : checkPoint.ThisTimeLineID, replayTLI)));
7897 :
501 rhaas 7898 GIC 141 : RecoveryRestartPoint(&checkPoint, record);
7899 : }
557 alvherre 7900 CBC 27923 : else if (info == XLOG_OVERWRITE_CONTRECORD)
557 alvherre 7901 ECB : {
417 heikki.linnakangas 7902 : /* nothing to do here, handled in xlogrecovery_redo() */
557 alvherre 7903 : }
3722 simon 7904 CBC 27922 : else if (info == XLOG_END_OF_RECOVERY)
7905 : {
3722 simon 7906 ECB : xl_end_of_recovery xlrec;
7907 : TimeLineID replayTLI;
7908 :
3722 simon 7909 GIC 8 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
7910 :
3722 simon 7911 ECB : /*
7912 : * For Hot Standby, we could treat this like a Shutdown Checkpoint,
7913 : * but this case is rarer and harder to test, so the benefit doesn't
7914 : * outweigh the potential extra cost of maintenance.
7915 : */
7916 :
7917 : /*
7918 : * We should've already switched to the new TLI before replaying this
7919 : * record.
7920 : */
417 heikki.linnakangas 7921 GIC 8 : (void) GetCurrentReplayRecPtr(&replayTLI);
520 rhaas 7922 8 : if (xlrec.ThisTimeLineID != replayTLI)
3722 simon 7923 LBC 0 : ereport(PANIC,
7924 : (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
7925 : xlrec.ThisTimeLineID, replayTLI)));
7926 : }
5803 tgl 7927 GIC 27914 : else if (info == XLOG_NOOP)
7928 : {
7929 : /* nothing to do here */
5803 tgl 7930 ECB : }
6090 tgl 7931 GIC 27914 : else if (info == XLOG_SWITCH)
7932 : {
7933 : /* nothing to do here */
7934 : }
4443 simon 7935 27820 : else if (info == XLOG_RESTORE_POINT)
7936 : {
417 heikki.linnakangas 7937 ECB : /* nothing to do here, handled in xlogrecovery.c */
4443 simon 7938 : }
3058 heikki.linnakangas 7939 GBC 27815 : else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
3670 simon 7940 EUB : {
7941 : /*
7942 : * XLOG_FPI records contain nothing else but one or more block
7943 : * references. Every block reference must include a full-page image
7944 : * even if full_page_writes was disabled when the record was generated
7945 : * - otherwise there would be no point in this record.
7946 : *
7947 : * XLOG_FPI_FOR_HINT records are generated when a page needs to be
7948 : * WAL-logged because of a hint bit update. They are only generated
627 fujii 7949 ECB : * when checksums and/or wal_log_hints are enabled. They may include
7950 : * no full-page images if full_page_writes was disabled when they were
7951 : * generated. In this case there is nothing to do here.
7952 : *
7953 : * No recovery conflicts are generated by these generic records - if a
7954 : * resource manager needs to generate conflicts, it has to define a
7955 : * separate WAL record type and redo routine.
7956 : */
387 tmunro 7957 CBC 57791 : for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
7958 : {
7959 : Buffer buffer;
7960 :
627 fujii 7961 GIC 30056 : if (!XLogRecHasBlockImage(record, block_id))
7962 : {
7963 81 : if (info == XLOG_FPI)
627 fujii 7964 UIC 0 : elog(ERROR, "XLOG_FPI record did not contain a full-page image");
627 fujii 7965 CBC 81 : continue;
627 fujii 7966 ECB : }
7967 :
1467 heikki.linnakangas 7968 CBC 29975 : if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
1467 heikki.linnakangas 7969 LBC 0 : elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
1467 heikki.linnakangas 7970 CBC 29975 : UnlockReleaseBuffer(buffer);
1467 heikki.linnakangas 7971 ECB : }
3670 simon 7972 : }
4843 heikki.linnakangas 7973 CBC 80 : else if (info == XLOG_BACKUP_END)
4843 heikki.linnakangas 7974 ECB : {
7975 : /* nothing to do here, handled in xlogrecovery_redo() */
7976 : }
4729 heikki.linnakangas 7977 GIC 19 : else if (info == XLOG_PARAMETER_CHANGE)
4827 heikki.linnakangas 7978 ECB : {
7979 : xl_parameter_change xlrec;
7980 :
7981 : /* Update our copy of the parameters in pg_control */
4729 heikki.linnakangas 7982 CBC 19 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
4729 heikki.linnakangas 7983 ECB :
7984 : /*
7985 : * Invalidate logical slots if we are in hot standby and the primary
7986 : * does not have a WAL level sufficient for logical decoding. No need
7987 : * to search for potentially conflicting logically slots if standby is
7988 : * running with wal_level lower than logical, because in that case, we
7989 : * would have either disallowed creation of logical slots or
7990 : * invalidated existing ones.
7991 : */
2 andres 7992 GNC 19 : if (InRecovery && InHotStandby &&
7993 4 : xlrec.wal_level < WAL_LEVEL_LOGICAL &&
7994 3 : wal_level >= WAL_LEVEL_LOGICAL)
7995 1 : InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
7996 : 0, InvalidOid,
7997 : InvalidTransactionId);
7998 :
4724 heikki.linnakangas 7999 CBC 19 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4729 heikki.linnakangas 8000 GIC 19 : ControlFile->MaxConnections = xlrec.MaxConnections;
3566 rhaas 8001 19 : ControlFile->max_worker_processes = xlrec.max_worker_processes;
1517 michael 8002 CBC 19 : ControlFile->max_wal_senders = xlrec.max_wal_senders;
4729 heikki.linnakangas 8003 19 : ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8004 19 : ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
4729 heikki.linnakangas 8005 GIC 19 : ControlFile->wal_level = xlrec.wal_level;
3006 8006 19 : ControlFile->wal_log_hints = xlrec.wal_log_hints;
8007 :
8008 : /*
8009 : * Update minRecoveryPoint to ensure that if recovery is aborted, we
4660 bruce 8010 ECB : * recover back up to this point before allowing hot standby again.
2596 peter_e 8011 : * This is important if the max_* settings are decreased, to ensure
1739 michael 8012 EUB : * you don't run queries against the WAL preceding the change. The
8013 : * local copies cannot be updated as long as crash recovery is
8014 : * happening and we expect all the WAL to be replayed.
8015 : */
1739 michael 8016 CBC 19 : if (InArchiveRecovery)
8017 : {
417 heikki.linnakangas 8018 5 : LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
417 heikki.linnakangas 8019 GIC 5 : LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8020 : }
8021 19 : if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
8022 : {
417 heikki.linnakangas 8023 ECB : TimeLineID replayTLI;
8024 :
417 heikki.linnakangas 8025 CBC 5 : (void) GetCurrentReplayRecPtr(&replayTLI);
4724 8026 5 : ControlFile->minRecoveryPoint = lsn;
520 rhaas 8027 GIC 5 : ControlFile->minRecoveryPointTLI = replayTLI;
4724 heikki.linnakangas 8028 EUB : }
4724 heikki.linnakangas 8029 ECB :
2747 alvherre 8030 GIC 19 : CommitTsParameterChange(xlrec.track_commit_timestamp,
8031 19 : ControlFile->track_commit_timestamp);
8032 19 : ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
8033 :
4729 heikki.linnakangas 8034 19 : UpdateControlFile();
4724 8035 19 : LWLockRelease(ControlFileLock);
8036 :
8037 : /* Check to see if any parameter change gives a problem on recovery */
4729 8038 19 : CheckRequiredParameterValues();
8039 : }
4092 simon 8040 UIC 0 : else if (info == XLOG_FPW_CHANGE)
8041 : {
8042 : bool fpw;
8043 :
4092 simon 8044 LBC 0 : memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8045 :
8046 : /*
8047 : * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8048 : * do_pg_backup_start() and do_pg_backup_stop() can check whether
8049 : * full_page_writes has been disabled during online backup.
8050 : */
8051 0 : if (!fpw)
8052 : {
3121 andres 8053 0 : SpinLockAcquire(&XLogCtl->info_lck);
501 rhaas 8054 UIC 0 : if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
501 rhaas 8055 UBC 0 : XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
3121 andres 8056 UIC 0 : SpinLockRelease(&XLogCtl->info_lck);
8057 : }
4092 simon 8058 ECB :
8059 : /* Keep track of full_page_writes */
4092 simon 8060 LBC 0 : lastFullPageWrites = fpw;
8061 : }
8205 vadim4o 8062 GIC 28159 : }
8053 bruce 8063 ECB :
8059 tgl 8064 : /*
8065 : * Return the extra open flags used for opening a file, depending on the
8066 : * value of the GUCs wal_sync_method, fsync and io_direct.
8067 : */
5443 magnus 8068 : static int
5443 magnus 8069 CBC 8348 : get_sync_bit(int method)
8059 tgl 8070 EUB : {
4790 bruce 8071 GIC 8348 : int o_direct_flag = 0;
8072 :
8073 : /*
8074 : * Use O_DIRECT if requested, except in walreceiver process. The WAL
8075 : * written by walreceiver is normally read by the startup process soon
8076 : * after it's written. Also, walreceiver performs unaligned writes, which
8077 : * don't work with O_DIRECT, so it is required for correctness too.
8078 : */
1 tmunro 8079 GNC 8348 : if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
4797 heikki.linnakangas 8080 GIC 7 : o_direct_flag = PG_O_DIRECT;
8081 :
8082 : /* If fsync is disabled, never open in sync mode */
1 tmunro 8083 GNC 8348 : if (!enableFsync)
8084 8348 : return o_direct_flag;
8085 :
5443 magnus 8086 UIC 0 : switch (method)
8087 : {
8088 : /*
5050 bruce 8089 ECB : * enum values for all sync options are defined even if they are
8090 : * not supported on the current platform. But if not, they are
5050 bruce 8091 EUB : * not included in the enum option array, and therefore will never
8092 : * be seen here.
8093 : */
5445 magnus 8094 UIC 0 : case SYNC_METHOD_FSYNC:
5445 magnus 8095 ECB : case SYNC_METHOD_FSYNC_WRITETHROUGH:
8096 : case SYNC_METHOD_FDATASYNC:
1 tmunro 8097 UNC 0 : return o_direct_flag;
8098 : #ifdef O_SYNC
5445 magnus 8099 LBC 0 : case SYNC_METHOD_OPEN:
261 tmunro 8100 UNC 0 : return O_SYNC | o_direct_flag;
8101 : #endif
8102 : #ifdef O_DSYNC
5445 magnus 8103 LBC 0 : case SYNC_METHOD_OPEN_DSYNC:
261 tmunro 8104 UNC 0 : return O_DSYNC | o_direct_flag;
8105 : #endif
5445 magnus 8106 UIC 0 : default:
5441 tgl 8107 ECB : /* can't happen (unless we are out of sync with option array) */
5441 tgl 8108 UIC 0 : elog(ERROR, "unrecognized wal_sync_method: %d", method);
8109 : return 0; /* silence warning */
8110 : }
8111 : }
8112 :
8113 : /*
8114 : * GUC support
8115 : */
8116 : void
4385 tgl 8117 GIC 1857 : assign_xlog_sync_method(int new_sync_method, void *extra)
8118 : {
5443 magnus 8119 1857 : if (sync_method != new_sync_method)
8120 : {
8121 : /*
8122 : * To ensure that no blocks escape unsynced, force an fsync on the
8123 : * currently open log segment (if any). Also, if the open flag is
8124 : * changing, close the log file so it will be reopened (with new flag
6385 bruce 8125 ECB : * bit) at next use.
8126 : */
8059 tgl 8127 UIC 0 : if (openLogFile >= 0)
8128 : {
2213 rhaas 8129 LBC 0 : pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
8059 tgl 8130 UIC 0 : if (pg_fsync(openLogFile) != 0)
1223 michael 8131 ECB : {
1223 michael 8132 EUB : char xlogfname[MAXFNAMELEN];
1223 michael 8133 ECB : int save_errno;
8134 :
1223 michael 8135 UIC 0 : save_errno = errno;
520 rhaas 8136 LBC 0 : XLogFileName(xlogfname, openLogTLI, openLogSegNo,
1223 michael 8137 EUB : wal_segment_size);
1223 michael 8138 LBC 0 : errno = save_errno;
7202 tgl 8139 UIC 0 : ereport(PANIC,
8140 : (errcode_for_file_access(),
1223 michael 8141 ECB : errmsg("could not fsync file \"%s\": %m", xlogfname)));
8142 : }
8143 :
2213 rhaas 8144 UIC 0 : pgstat_report_wait_end();
5441 tgl 8145 LBC 0 : if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
6142 bruce 8146 UIC 0 : XLogFileClose();
8147 : }
8148 : }
8059 tgl 8149 GIC 1857 : }
8059 tgl 8150 ECB :
8151 :
8152 : /*
8153 : * Issue appropriate kind of fsync (if any) for an XLOG output file.
8154 : *
8155 : * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8156 : * 'segno' is for error reporting purposes.
8157 : */
8158 : void
520 rhaas 8159 GIC 315883 : issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
8059 tgl 8160 ECB : {
1223 michael 8161 CBC 315883 : char *msg = NULL;
761 fujii 8162 ECB : instr_time start;
8163 :
520 rhaas 8164 GIC 315883 : Assert(tli != 0);
8165 :
8166 : /*
761 fujii 8167 ECB : * Quick exit if fsync is disabled or write() has already synced the WAL
8168 : * file.
8169 : */
761 fujii 8170 CBC 315883 : if (!enableFsync ||
761 fujii 8171 LBC 0 : sync_method == SYNC_METHOD_OPEN ||
8172 0 : sync_method == SYNC_METHOD_OPEN_DSYNC)
761 fujii 8173 CBC 315883 : return;
761 fujii 8174 ECB :
8175 : /* Measure I/O timing to sync the WAL file */
761 fujii 8176 UIC 0 : if (track_wal_io_timing)
8177 0 : INSTR_TIME_SET_CURRENT(start);
8178 : else
79 andres 8179 UNC 0 : INSTR_TIME_SET_ZERO(start);
8180 :
1742 michael 8181 UIC 0 : pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
8059 tgl 8182 0 : switch (sync_method)
8183 : {
7836 bruce 8184 0 : case SYNC_METHOD_FSYNC:
4832 heikki.linnakangas 8185 0 : if (pg_fsync_no_writethrough(fd) != 0)
1223 michael 8186 LBC 0 : msg = _("could not fsync file \"%s\": %m");
8059 tgl 8187 UIC 0 : break;
6533 bruce 8188 ECB : #ifdef HAVE_FSYNC_WRITETHROUGH
8189 : case SYNC_METHOD_FSYNC_WRITETHROUGH:
8190 : if (pg_fsync_writethrough(fd) != 0)
1223 michael 8191 : msg = _("could not fsync write-through file \"%s\": %m");
8192 : break;
8193 : #endif
8059 tgl 8194 LBC 0 : case SYNC_METHOD_FDATASYNC:
4832 heikki.linnakangas 8195 0 : if (pg_fdatasync(fd) != 0)
1223 michael 8196 0 : msg = _("could not fdatasync file \"%s\": %m");
8059 tgl 8197 UIC 0 : break;
8059 tgl 8198 LBC 0 : case SYNC_METHOD_OPEN:
5445 magnus 8199 ECB : case SYNC_METHOD_OPEN_DSYNC:
761 fujii 8200 : /* not reachable */
761 fujii 8201 UIC 0 : Assert(false);
8059 tgl 8202 ECB : break;
8059 tgl 8203 LBC 0 : default:
7202 tgl 8204 UIC 0 : elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8205 : break;
8059 tgl 8206 ECB : }
8207 :
1223 michael 8208 EUB : /* PANIC if failed to fsync */
1223 michael 8209 UIC 0 : if (msg)
8210 : {
8211 : char xlogfname[MAXFNAMELEN];
1223 michael 8212 UBC 0 : int save_errno = errno;
8213 :
520 rhaas 8214 UIC 0 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
1223 michael 8215 0 : errno = save_errno;
8216 0 : ereport(PANIC,
8217 : (errcode_for_file_access(),
8218 : errmsg(msg, xlogfname)));
1223 michael 8219 EUB : }
8220 :
1223 michael 8221 UBC 0 : pgstat_report_wait_end();
761 fujii 8222 EUB :
8223 : /*
8224 : * Increment the I/O timing and the number of times WAL files were synced.
8225 : */
761 fujii 8226 UIC 0 : if (track_wal_io_timing)
8227 : {
761 fujii 8228 EUB : instr_time duration;
8229 :
761 fujii 8230 LBC 0 : INSTR_TIME_SET_CURRENT(duration);
10 andres 8231 UNC 0 : INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, duration, start);
8232 : }
8233 :
368 andres 8234 UIC 0 : PendingWalStats.wal_sync++;
8235 : }
3941 heikki.linnakangas 8236 ECB :
8237 : /*
368 sfrost 8238 : * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
8239 : * function. It creates the necessary starting checkpoint and constructs the
8240 : * backup state and tablespace map.
8241 : *
8242 : * Input parameters are "state" (the backup state), "fast" (if true, we do
8243 : * the checkpoint in immediate mode to make it faster), and "tablespaces"
8244 : * (if non-NULL, indicates a list of tablespaceinfo structs describing the
8245 : * cluster's tablespaces.).
8246 : *
8247 : * The tablespace map contents are appended to passed-in parameter
8248 : * tablespace_map and the caller is responsible for including it in the backup
8249 : * archive as 'tablespace_map'. The tablespace_map file is required mainly for
8250 : * tar format in windows as native windows utilities are not able to create
8251 : * symlinks while extracting files from tar. However for consistency and
8252 : * platform-independence, we do it the same way everywhere.
8253 : *
8254 : * It fills in "state" with the information required for the backup, such
8255 : * as the minimum WAL location that must be present to restore from this
8256 : * backup (starttli) and the corresponding timeline ID (starttli).
8257 : *
8258 : * Every successfully started backup must be stopped by calling
8259 : * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
363 tgl 8260 EUB : * backups active at the same time.
8261 : *
8262 : * It is the responsibility of the caller of this function to verify the
3379 magnus 8263 : * permissions of the calling user!
8264 : */
8265 : void
195 michael 8266 GNC 130 : do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
8267 : BackupState *state, StringInfo tblspcmapfile)
4473 magnus 8268 EUB : {
8269 : bool backup_started_in_recovery;
8270 :
195 michael 8271 GNC 130 : Assert(state != NULL);
4092 simon 8272 GIC 130 : backup_started_in_recovery = RecoveryInProgress();
8273 :
8274 : /*
8275 : * During recovery, we don't need to check WAL level. Because, if WAL
3955 bruce 8276 ECB : * level is not sufficient, it's impossible to get here during recovery.
8277 : */
4092 simon 8278 CBC 130 : if (!backup_started_in_recovery && !XLogIsNeeded())
5674 tgl 8279 UIC 0 : ereport(ERROR,
8280 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8281 : errmsg("WAL level not sufficient for making an online backup"),
8282 : errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
8283 :
4451 heikki.linnakangas 8284 GIC 130 : if (strlen(backupidstr) > MAXPGPATH)
8285 1 : ereport(ERROR,
4451 heikki.linnakangas 8286 EUB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8287 : errmsg("backup label too long (max %d bytes)",
8288 : MAXPGPATH)));
8289 :
195 michael 8290 GNC 129 : memcpy(state->name, backupidstr, strlen(backupidstr));
8291 :
8292 : /*
8293 : * Mark backup active in shared memory. We must do full-page WAL writes
8294 : * during an on-line backup even if not doing so at other times, because
8295 : * it's quite possible for the backup dump to obtain a "torn" (partially
6031 bruce 8296 EUB : * written) copy of a database page if it reads the page concurrently with
3260 8297 : * our write to the same page. This can be fixed as long as the first
8298 : * write to the page in the WAL sequence is a full-page write. Hence, we
8299 : * increment runningBackups then force a CHECKPOINT, to ensure there are
8300 : * no dirty pages in shared memory that might get dumped while the backup
8301 : * is in progress without having a corresponding WAL record. (Once the
8302 : * backup is complete, we need not force full-page writes anymore, since
8303 : * we expect that any pages not modified during the backup interval must
8304 : * have been correctly captured by the backup.)
6201 tgl 8305 : *
8306 : * Note that forcing full-page writes has no effect during an online
8307 : * backup from the standby.
8308 : *
8309 : * We must hold all the insertion locks to change the value of
8310 : * runningBackups, to ensure adequate interlocking against
8311 : * XLogInsertRecord().
8312 : */
3306 heikki.linnakangas 8313 GIC 129 : WALInsertLockAcquireExclusive();
368 sfrost 8314 129 : XLogCtl->Insert.runningBackups++;
3306 heikki.linnakangas 8315 129 : WALInsertLockRelease();
8316 :
8317 : /*
8318 : * Ensure we decrement runningBackups if we fail below. NB -- for this to
8319 : * work correctly, it is critical that sessionBackupState is only updated
8320 : * after this block is over.
8321 : */
172 alvherre 8322 GNC 129 : PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
6201 tgl 8323 ECB : {
4382 bruce 8324 GIC 129 : bool gotUniqueStartpoint = false;
1952 tgl 8325 ECB : DIR *tblspcdir;
8326 : struct dirent *de;
8327 : tablespaceinfo *ti;
2889 andrew 8328 : int datadirpathlen;
8329 :
8330 : /*
8331 : * Force an XLOG file switch before the checkpoint, to ensure that the
8332 : * WAL segment the checkpoint is written to doesn't contain pages with
8333 : * old timeline IDs. That would otherwise happen if you called
368 sfrost 8334 : * pg_backup_start() right after restoring from a PITR archive: the
4136 tgl 8335 EUB : * first WAL segment containing the startup checkpoint has pages in
3260 bruce 8336 : * the beginning with the old timeline ID. That can cause trouble at
4136 tgl 8337 ECB : * recovery: we won't have a history file covering the old timeline if
8338 : * pg_wal directory was not included in the base backup and the WAL
8339 : * archive was cleared too before starting the backup.
4136 tgl 8340 EUB : *
8341 : * This also ensures that we have emitted a WAL page header that has
8342 : * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
8343 : * Therefore, if a WAL archiver (such as pglesslog) is trying to
8344 : * compress out removable backup blocks, it won't remove any that
8345 : * occur after this point.
4092 simon 8346 : *
8347 : * During recovery, we skip forcing XLOG file switch, which means that
8348 : * the backup taken during recovery is not available for the special
8349 : * recovery case described above.
4136 tgl 8350 : */
4092 simon 8351 GBC 129 : if (!backup_started_in_recovery)
2299 andres 8352 GIC 124 : RequestXLogSwitch(false);
8353 :
8354 : do
8355 : {
8356 : bool checkpointfpw;
8357 :
4402 heikki.linnakangas 8358 EUB : /*
3260 bruce 8359 : * Force a CHECKPOINT. Aside from being necessary to prevent torn
4382 8360 : * page problems, this guarantees that two successive backup runs
8361 : * will have different checkpoint positions and hence different
8362 : * history file names, even if nothing happened in between.
8363 : *
8364 : * During recovery, establish a restartpoint if possible. We use
3955 8365 : * the last restartpoint as the backup starting checkpoint. This
8366 : * means that two successive backup runs can have same checkpoint
8367 : * positions.
4092 simon 8368 : *
8369 : * Since the fact that we are executing do_pg_backup_start()
8370 : * during recovery means that checkpointer is running, we can use
8371 : * RequestCheckpoint() to establish a restartpoint.
8372 : *
4382 bruce 8373 : * We use CHECKPOINT_IMMEDIATE only if requested by user (via
8374 : * passing fast = true). Otherwise this can take awhile.
8375 : */
4402 heikki.linnakangas 8376 GBC 129 : RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8377 : (fast ? CHECKPOINT_IMMEDIATE : 0));
6822 tgl 8378 EUB :
4402 heikki.linnakangas 8379 : /*
4382 bruce 8380 : * Now we need to fetch the checkpoint record location, and also
8381 : * its REDO pointer. The oldest point in WAL that would be needed
8382 : * to restore starting from the checkpoint is precisely the REDO
8383 : * pointer.
8384 : */
4402 heikki.linnakangas 8385 GBC 129 : LWLockAcquire(ControlFileLock, LW_SHARED);
195 michael 8386 GNC 129 : state->checkpointloc = ControlFile->checkPoint;
8387 129 : state->startpoint = ControlFile->checkPointCopy.redo;
8388 129 : state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
4092 simon 8389 GIC 129 : checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
4402 heikki.linnakangas 8390 GBC 129 : LWLockRelease(ControlFileLock);
8391 :
4092 simon 8392 GIC 129 : if (backup_started_in_recovery)
8393 : {
3955 bruce 8394 EUB : XLogRecPtr recptr;
4092 simon 8395 :
8396 : /*
8397 : * Check to see if all WAL replayed during online backup
3955 bruce 8398 : * (i.e., since last restartpoint used as backup starting
8399 : * checkpoint) contain full-page writes.
8400 : */
3121 andres 8401 GIC 5 : SpinLockAcquire(&XLogCtl->info_lck);
8402 5 : recptr = XLogCtl->lastFpwDisableRecPtr;
8403 5 : SpinLockRelease(&XLogCtl->info_lck);
8404 :
195 michael 8405 GNC 5 : if (!checkpointfpw || state->startpoint <= recptr)
4092 simon 8406 UIC 0 : ereport(ERROR,
8407 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8408 : errmsg("WAL generated with full_page_writes=off was replayed "
8409 : "since last restartpoint"),
8410 : errhint("This means that the backup being taken on the standby "
8411 : "is corrupt and should not be used. "
8412 : "Enable full_page_writes and run CHECKPOINT on the primary, "
8413 : "and then try an online backup again.")));
8414 :
8415 : /*
8416 : * During recovery, since we don't use the end-of-backup WAL
8417 : * record and don't write the backup history file, the
8418 : * starting WAL location doesn't need to be unique. This means
8419 : * that two base backups started at the same time might use
8420 : * the same checkpoint as starting locations.
8421 : */
4092 simon 8422 GIC 5 : gotUniqueStartpoint = true;
8423 : }
8424 :
8425 : /*
8426 : * If two base backups are started at the same time (in WAL sender
8427 : * processes), we need to make sure that they use different
8428 : * checkpoints as starting locations, because we use the starting
8429 : * WAL location as a unique identifier for the base backup in the
4382 bruce 8430 ECB : * end-of-backup WAL record and when we write the backup history
8431 : * file. Perhaps it would be better generate a separate unique ID
8432 : * for each backup instead of forcing another checkpoint, but
8433 : * taking a checkpoint right after another is not that expensive
8434 : * either because only few buffers have been dirtied yet.
4402 heikki.linnakangas 8435 : */
3306 heikki.linnakangas 8436 CBC 129 : WALInsertLockAcquireExclusive();
195 michael 8437 GNC 129 : if (XLogCtl->Insert.lastBackupStart < state->startpoint)
8438 : {
8439 129 : XLogCtl->Insert.lastBackupStart = state->startpoint;
4402 heikki.linnakangas 8440 GIC 129 : gotUniqueStartpoint = true;
8441 : }
3306 heikki.linnakangas 8442 CBC 129 : WALInsertLockRelease();
4382 bruce 8443 GBC 129 : } while (!gotUniqueStartpoint);
8444 :
2889 andrew 8445 ECB : /*
363 tgl 8446 : * Construct tablespace_map file.
8447 : */
2889 andrew 8448 GIC 129 : datadirpathlen = strlen(DataDir);
8449 :
8450 : /* Collect information about all tablespaces */
1952 tgl 8451 CBC 129 : tblspcdir = AllocateDir("pg_tblspc");
2889 andrew 8452 GIC 414 : while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
8453 : {
8454 : char fullpath[MAXPGPATH + 10];
8455 : char linkpath[MAXPGPATH];
8456 285 : char *relpath = NULL;
8457 : int rllen;
8458 : StringInfoData escapedpath;
8459 : char *s;
8460 :
8461 : /* Skip anything that doesn't look like a tablespace */
753 tgl 8462 285 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2889 andrew 8463 269 : continue;
8464 :
8465 27 : snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
8466 :
8467 : /*
8468 : * Skip anything that isn't a symlink/junction. For testing only,
8469 : * we sometimes use allow_in_place_tablespaces to create
8470 : * directories directly under pg_tblspc, which would fail below.
8471 : */
390 tmunro 8472 27 : if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK)
8473 11 : continue;
390 tmunro 8474 ECB :
2889 andrew 8475 CBC 16 : rllen = readlink(fullpath, linkpath, sizeof(linkpath));
2889 andrew 8476 GIC 16 : if (rllen < 0)
8477 : {
2889 andrew 8478 UIC 0 : ereport(WARNING,
8479 : (errmsg("could not read symbolic link \"%s\": %m",
8480 : fullpath)));
8481 0 : continue;
2889 andrew 8482 ECB : }
2889 andrew 8483 GIC 16 : else if (rllen >= sizeof(linkpath))
2889 andrew 8484 ECB : {
2889 andrew 8485 UIC 0 : ereport(WARNING,
8486 : (errmsg("symbolic link \"%s\" target is too long",
8487 : fullpath)));
8488 0 : continue;
8489 : }
2889 andrew 8490 GIC 16 : linkpath[rllen] = '\0';
8491 :
8492 : /*
8493 : * Build a backslash-escaped version of the link path to include
8494 : * in the tablespace map file.
8495 : */
753 tgl 8496 16 : initStringInfo(&escapedpath);
8497 444 : for (s = linkpath; *s; s++)
8498 : {
8499 428 : if (*s == '\n' || *s == '\r' || *s == '\\')
753 tgl 8500 UIC 0 : appendStringInfoChar(&escapedpath, '\\');
753 tgl 8501 GIC 428 : appendStringInfoChar(&escapedpath, *s);
8502 : }
8503 :
8504 : /*
8505 : * Relpath holds the relative path of the tablespace directory
8506 : * when it's located within PGDATA, or NULL if it's located
8507 : * elsewhere.
8508 : */
2889 andrew 8509 16 : if (rllen > datadirpathlen &&
8510 1 : strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
2889 andrew 8511 LBC 0 : IS_DIR_SEP(linkpath[datadirpathlen]))
8512 0 : relpath = linkpath + datadirpathlen + 1;
8513 :
2889 andrew 8514 GIC 16 : ti = palloc(sizeof(tablespaceinfo));
8515 16 : ti->oid = pstrdup(de->d_name);
753 tgl 8516 16 : ti->path = pstrdup(linkpath);
2889 andrew 8517 16 : ti->rpath = relpath ? pstrdup(relpath) : NULL;
1026 rhaas 8518 16 : ti->size = -1;
8519 :
2878 bruce 8520 16 : if (tablespaces)
8521 16 : *tablespaces = lappend(*tablespaces, ti);
8522 :
753 tgl 8523 16 : appendStringInfo(tblspcmapfile, "%s %s\n",
8524 : ti->oid, escapedpath.data);
8525 :
8526 16 : pfree(escapedpath.data);
8527 : }
1952 8528 129 : FreeDir(tblspcdir);
8529 :
195 michael 8530 GNC 129 : state->starttime = (pg_time_t) time(NULL);
8531 : }
172 alvherre 8532 129 : PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
8533 :
195 michael 8534 129 : state->started_in_recovery = backup_started_in_recovery;
195 michael 8535 ECB :
2273 fujii 8536 : /*
8537 : * Mark that the start phase has correctly finished for the backup.
8538 : */
368 sfrost 8539 GBC 129 : sessionBackupState = SESSION_BACKUP_RUNNING;
6823 tgl 8540 GIC 129 : }
8541 :
8542 : /*
8543 : * Utility routine to fetch the session-level status of a backup running.
8544 : */
2207 teodor 8545 ECB : SessionBackupState
2207 teodor 8546 CBC 149 : get_backup_status(void)
8547 : {
8548 149 : return sessionBackupState;
2207 teodor 8549 ECB : }
8550 :
4451 heikki.linnakangas 8551 : /*
368 sfrost 8552 : * do_pg_backup_stop
8553 : *
8554 : * Utility function called at the end of an online backup. It creates history
8555 : * file (if required), resets sessionBackupState and so on. It can optionally
8556 : * wait for WAL segments to be archived.
8557 : *
8558 : * "state" is filled with the information necessary to restore from this
8559 : * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
8560 : *
3379 magnus 8561 : * It is the responsibility of the caller of this function to verify the
8562 : * permissions of the calling user!
8563 : */
8564 : void
195 michael 8565 GNC 122 : do_pg_backup_stop(BackupState *state, bool waitforarchive)
6823 tgl 8566 ECB : {
195 michael 8567 GNC 122 : bool backup_stopped_in_recovery = false;
6822 tgl 8568 ECB : char histfilepath[MAXPGPATH];
8569 : char lastxlogfilename[MAXFNAMELEN];
8570 : char histfilename[MAXFNAMELEN];
8571 : XLogSegNo _logSegNo;
8572 : FILE *fp;
5482 bruce 8573 : int seconds_before_warning;
5482 bruce 8574 CBC 122 : int waits = 0;
4739 simon 8575 GIC 122 : bool reported_waiting = false;
8576 :
195 michael 8577 GNC 122 : Assert(state != NULL);
8578 :
8579 122 : backup_stopped_in_recovery = RecoveryInProgress();
6823 tgl 8580 EUB :
8581 : /*
3955 bruce 8582 ECB : * During recovery, we don't need to check WAL level. Because, if WAL
8583 : * level is not sufficient, it's impossible to get here during recovery.
4092 simon 8584 EUB : */
195 michael 8585 GNC 122 : if (!backup_stopped_in_recovery && !XLogIsNeeded())
5326 tgl 8586 UIC 0 : ereport(ERROR,
5326 tgl 8587 EUB : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8588 : errmsg("WAL level not sufficient for making an online backup"),
2596 peter_e 8589 ECB : errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
8590 :
8591 : /*
8592 : * OK to update backup counter and session-level lock.
8593 : *
8594 : * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
8595 : * otherwise they can be updated inconsistently, which might cause
1937 fujii 8596 : * do_pg_abort_backup() to fail.
8597 : */
2273 fujii 8598 CBC 122 : WALInsertLockAcquireExclusive();
4451 heikki.linnakangas 8599 EUB :
368 sfrost 8600 ECB : /*
8601 : * It is expected that each do_pg_backup_start() call is matched by
8602 : * exactly one do_pg_backup_stop() call.
8603 : */
368 sfrost 8604 GIC 122 : Assert(XLogCtl->Insert.runningBackups > 0);
8605 122 : XLogCtl->Insert.runningBackups--;
8606 :
8607 : /*
1937 fujii 8608 ECB : * Clean up session-level lock.
8609 : *
1809 tgl 8610 : * You might think that WALInsertLockRelease() can be called before
8611 : * cleaning up session-level lock because session-level lock doesn't need
8612 : * to be protected with WAL insertion lock. But since
8613 : * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
8614 : * cleaned up before it.
1937 fujii 8615 : */
2207 teodor 8616 GIC 122 : sessionBackupState = SESSION_BACKUP_NONE;
2207 teodor 8617 ECB :
1937 fujii 8618 GIC 122 : WALInsertLockRelease();
8619 :
6823 tgl 8620 ECB : /*
8621 : * If we are taking an online backup from the standby, we confirm that the
8622 : * standby has not been promoted during the backup.
8623 : */
195 michael 8624 GNC 122 : if (state->started_in_recovery && !backup_stopped_in_recovery)
4092 simon 8625 UIC 0 : ereport(ERROR,
8626 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8627 : errmsg("the standby was promoted during online backup"),
8628 : errhint("This means that the backup being taken is corrupt "
8629 : "and should not be used. "
8630 : "Try taking another online backup.")));
8631 :
8632 : /*
8633 : * During recovery, we don't write an end-of-backup record. We assume that
8634 : * pg_control was backed up last and its minimum recovery point can be
8635 : * available as the backup end location. Since we don't have an
8636 : * end-of-backup record, we use the pg_control value to check whether
8637 : * we've reached the end of backup when starting recovery from this
8638 : * backup. We have no way of checking if pg_control wasn't backed up last
8639 : * however.
4092 simon 8640 ECB : *
8641 : * We don't force a switch to new WAL file but it is still possible to
2073 rhaas 8642 : * wait for all the required files to be archived if waitforarchive is
8643 : * true. This is okay if we use the backup to start a standby and fetch
8644 : * the missing WAL using streaming replication. But in the case of an
8645 : * archive recovery, a user should set waitforarchive to true and wait for
8646 : * them to be archived to ensure that all the required files are
8647 : * available.
8648 : *
4092 simon 8649 : * We return the current minimum recovery point as the backup end
3839 heikki.linnakangas 8650 : * location. Note that it can be greater than the exact backup end
8651 : * location if the minimum recovery point is updated after the backup of
3955 bruce 8652 : * pg_control. This is harmless for current uses.
8653 : *
4092 simon 8654 : * XXX currently a backup history file is for informational and debug
8655 : * purposes only. It's not essential for an online backup. Furthermore,
8656 : * even if it's created, it will not be archived during recovery because
8657 : * an archiver is not invoked. So it doesn't seem worthwhile to write a
8658 : * backup history file during recovery.
8659 : */
195 michael 8660 GNC 122 : if (backup_stopped_in_recovery)
4092 simon 8661 EUB : {
8662 : XLogRecPtr recptr;
8663 :
8664 : /*
8665 : * Check to see if all WAL replayed during online backup contain
8666 : * full-page writes.
8667 : */
3121 andres 8668 GIC 5 : SpinLockAcquire(&XLogCtl->info_lck);
8669 5 : recptr = XLogCtl->lastFpwDisableRecPtr;
8670 5 : SpinLockRelease(&XLogCtl->info_lck);
8671 :
195 michael 8672 GNC 5 : if (state->startpoint <= recptr)
4092 simon 8673 LBC 0 : ereport(ERROR,
8674 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8675 : errmsg("WAL generated with full_page_writes=off was replayed "
8676 : "during online backup"),
8677 : errhint("This means that the backup being taken on the standby "
8678 : "is corrupt and should not be used. "
1029 andres 8679 ECB : "Enable full_page_writes and run CHECKPOINT on the primary, "
2118 tgl 8680 : "and then try an online backup again.")));
8681 :
8682 :
4092 simon 8683 GIC 5 : LWLockAcquire(ControlFileLock, LW_SHARED);
195 michael 8684 GNC 5 : state->stoppoint = ControlFile->minRecoveryPoint;
8685 5 : state->stoptli = ControlFile->minRecoveryPointTLI;
4092 simon 8686 GIC 5 : LWLockRelease(ControlFileLock);
8687 : }
8688 : else
8689 : {
8690 : char *history_file;
8691 :
8692 : /*
2073 rhaas 8693 ECB : * Write the backup-end xlog record
8694 : */
2073 rhaas 8695 CBC 117 : XLogBeginInsert();
195 michael 8696 GNC 117 : XLogRegisterData((char *) (&state->startpoint),
8697 : sizeof(state->startpoint));
8698 117 : state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
8699 :
8700 : /*
8701 : * Given that we're not in recovery, InsertTimeLineID is set and can't
520 rhaas 8702 ECB : * change, so we can read it without a lock.
520 rhaas 8703 EUB : */
195 michael 8704 GNC 117 : state->stoptli = XLogCtl->InsertTimeLineID;
8705 :
8706 : /*
8707 : * Force a switch to a new xlog segment file, so that the backup is
8708 : * valid as soon as archiver moves out the current segment file.
8709 : */
2073 rhaas 8710 GIC 117 : RequestXLogSwitch(false);
8711 :
195 michael 8712 GNC 117 : state->stoptime = (pg_time_t) time(NULL);
8713 :
8714 : /*
8715 : * Write the backup history file
8716 : */
8717 117 : XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
8718 117 : BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
8719 : state->startpoint, wal_segment_size);
2073 rhaas 8720 GIC 117 : fp = AllocateFile(histfilepath, "w");
8721 117 : if (!fp)
2073 rhaas 8722 UIC 0 : ereport(ERROR,
8723 : (errcode_for_file_access(),
8724 : errmsg("could not create file \"%s\": %m",
8725 : histfilepath)));
8726 :
8727 : /* Build and save the contents of the backup history file */
195 michael 8728 GNC 117 : history_file = build_backup_content(state, true);
194 8729 117 : fprintf(fp, "%s", history_file);
195 8730 117 : pfree(history_file);
8731 :
2073 rhaas 8732 GIC 117 : if (fflush(fp) || ferror(fp) || FreeFile(fp))
2073 rhaas 8733 LBC 0 : ereport(ERROR,
2073 rhaas 8734 ECB : (errcode_for_file_access(),
8735 : errmsg("could not write file \"%s\": %m",
8736 : histfilepath)));
6797 bruce 8737 :
2073 rhaas 8738 EUB : /*
8739 : * Clean out any no-longer-needed history files. As a side effect,
8740 : * this will post a .ready file for the newly created history file,
8741 : * notifying the archiver that history file may be archived
8742 : * immediately.
8743 : */
2073 rhaas 8744 GIC 117 : CleanupBackupHistory();
8745 : }
8746 :
8747 : /*
4728 tgl 8748 ECB : * If archiving is enabled, wait for all the required WAL files to be
4660 bruce 8749 : * archived before returning. If archiving isn't enabled, the required WAL
8750 : * needs to be transported via streaming replication (hopefully with
993 fujii 8751 : * wal_keep_size set high enough), or some more exotic mechanism like
8752 : * polling and copying files from pg_wal with script. We have no knowledge
8753 : * of those mechanisms, so it's up to the user to ensure that he gets all
8754 : * the required WAL.
8755 : *
8756 : * We wait until both the last WAL file filled during backup and the
8757 : * history file have been archived, and assume that the alphabetic sorting
8758 : * property of the WAL files ensures any earlier WAL files are safely
8759 : * archived as well.
5482 bruce 8760 : *
5050 8761 : * We wait forever, since archive_command is supposed to work and we
8762 : * assume the admin wanted his backup to work completely. If you don't
2209 sfrost 8763 : * wish to wait, then either waitforarchive should be passed in as false,
8764 : * or you can set statement_timeout. Also, some notices are issued to
8765 : * clue in anyone who might be doing this interactively.
8766 : */
8767 :
2073 rhaas 8768 GIC 122 : if (waitforarchive &&
195 michael 8769 GNC 8 : ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
8770 1 : (backup_stopped_in_recovery && XLogArchivingAlways())))
8771 : {
8772 2 : XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
8773 2 : XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
8774 : wal_segment_size);
8775 :
8776 2 : XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
8777 2 : BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
8778 : state->startpoint, wal_segment_size);
8779 :
4660 bruce 8780 GIC 2 : seconds_before_warning = 60;
8781 2 : waits = 0;
8782 :
4660 bruce 8783 CBC 6 : while (XLogArchiveIsBusy(lastxlogfilename) ||
8784 2 : XLogArchiveIsBusy(histfilename))
8785 : {
8786 2 : CHECK_FOR_INTERRUPTS();
4739 simon 8787 ECB :
4660 bruce 8788 GBC 2 : if (!reported_waiting && waits > 5)
8789 : {
4660 bruce 8790 UIC 0 : ereport(NOTICE,
8791 : (errmsg("base backup done, waiting for required WAL segments to be archived")));
8792 0 : reported_waiting = true;
8793 : }
5482 bruce 8794 ECB :
642 michael 8795 CBC 2 : (void) WaitLatch(MyLatch,
642 michael 8796 ECB : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
8797 : 1000L,
8798 : WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
642 michael 8799 GBC 2 : ResetLatch(MyLatch);
8800 :
4660 bruce 8801 GIC 2 : if (++waits >= seconds_before_warning)
8802 : {
4660 bruce 8803 UIC 0 : seconds_before_warning *= 2; /* This wraps in >10 years... */
8804 0 : ereport(WARNING,
8805 : (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
8806 : waits),
8807 : errhint("Check that your archive_command is executing properly. "
8808 : "You can safely cancel this backup, "
8809 : "but the database backup will not be usable without all the WAL segments.")));
4660 bruce 8810 ECB : }
8811 : }
8812 :
4660 bruce 8813 GIC 2 : ereport(NOTICE,
8814 : (errmsg("all required WAL segments have been archived")));
8815 : }
4442 magnus 8816 120 : else if (waitforarchive)
4728 tgl 8817 6 : ereport(NOTICE,
8818 : (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
4473 magnus 8819 122 : }
8820 :
8821 :
8822 : /*
8823 : * do_pg_abort_backup: abort a running backup
8824 : *
8825 : * This does just the most basic steps of do_pg_backup_stop(), by taking the
8826 : * system out of backup mode, thus making it a lot more safe to call from
4473 magnus 8827 ECB : * an error handler.
4451 heikki.linnakangas 8828 : *
8829 : * 'arg' indicates that it's being called during backup setup; so
8830 : * sessionBackupState has not been modified yet, but runningBackups has
8831 : * already been incremented. When it's false, then it's invoked as a
8832 : * before_shmem_exit handler, and therefore we must not change state
8833 : * unless sessionBackupState indicates that a backup is actually running.
1207 rhaas 8834 : *
8835 : * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
8836 : * before_shmem_exit handler, hence the odd-looking signature.
8837 : */
4473 magnus 8838 : void
1207 rhaas 8839 CBC 9 : do_pg_abort_backup(int code, Datum arg)
8840 : {
172 alvherre 8841 GNC 9 : bool during_backup_start = DatumGetBool(arg);
1207 rhaas 8842 ECB :
8843 : /* If called during backup start, there shouldn't be one already running */
167 alvherre 8844 GNC 9 : Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
1937 fujii 8845 ECB :
172 alvherre 8846 GNC 9 : if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
4451 heikki.linnakangas 8847 EUB : {
172 alvherre 8848 GNC 7 : WALInsertLockAcquireExclusive();
8849 7 : Assert(XLogCtl->Insert.runningBackups > 0);
8850 7 : XLogCtl->Insert.runningBackups--;
8851 :
8852 7 : sessionBackupState = SESSION_BACKUP_NONE;
8853 7 : WALInsertLockRelease();
8854 :
8855 7 : if (!during_backup_start)
8856 7 : ereport(WARNING,
8857 : errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
8858 : }
1207 rhaas 8859 GIC 9 : }
1207 rhaas 8860 EUB :
8861 : /*
8862 : * Register a handler that will warn about unterminated backups at end of
8863 : * session, unless this has already been done.
8864 : */
8865 : void
1207 rhaas 8866 GIC 4 : register_persistent_abort_backup_handler(void)
8867 : {
8868 : static bool already_done = false;
8869 :
1207 rhaas 8870 CBC 4 : if (already_done)
1207 rhaas 8871 GIC 1 : return;
172 alvherre 8872 GNC 3 : before_shmem_exit(do_pg_abort_backup, DatumGetBool(false));
1207 rhaas 8873 CBC 3 : already_done = true;
6823 tgl 8874 ECB : }
8875 :
4832 heikki.linnakangas 8876 : /*
8877 : * Get latest WAL insert pointer
8878 : */
8879 : XLogRecPtr
4106 heikki.linnakangas 8880 GIC 4968 : GetXLogInsertRecPtr(void)
8881 : {
3121 andres 8882 4968 : XLogCtlInsert *Insert = &XLogCtl->Insert;
8883 : uint64 current_bytepos;
8884 :
3562 heikki.linnakangas 8885 4968 : SpinLockAcquire(&Insert->insertpos_lck);
8886 4968 : current_bytepos = Insert->CurrBytePos;
8887 4968 : SpinLockRelease(&Insert->insertpos_lck);
8888 :
8889 4968 : return XLogBytePosToRecPtr(current_bytepos);
8890 : }
8891 :
8892 : /*
8893 : * Get latest WAL write pointer
8894 : */
8895 : XLogRecPtr
417 heikki.linnakangas 8896 CBC 1029 : GetXLogWriteRecPtr(void)
8897 : {
8898 1029 : SpinLockAcquire(&XLogCtl->info_lck);
417 heikki.linnakangas 8899 GIC 1029 : LogwrtResult = XLogCtl->LogwrtResult;
8900 1029 : SpinLockRelease(&XLogCtl->info_lck);
2889 andrew 8901 ECB :
417 heikki.linnakangas 8902 GIC 1029 : return LogwrtResult.Write;
2889 andrew 8903 ECB : }
8904 :
6225 tgl 8905 : /*
417 heikki.linnakangas 8906 : * Returns the redo pointer of the last checkpoint or restartpoint. This is
8907 : * the oldest point in WAL that we still need, if we have to restart recovery.
8908 : */
8909 : void
417 heikki.linnakangas 8910 CBC 53 : GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
8911 : {
8912 53 : LWLockAcquire(ControlFileLock, LW_SHARED);
8913 53 : *oldrecptr = ControlFile->checkPointCopy.redo;
417 heikki.linnakangas 8914 GIC 53 : *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
8915 53 : LWLockRelease(ControlFileLock);
6225 tgl 8916 CBC 53 : }
8917 :
8918 : /* Thin wrapper around ShutdownWalRcv(). */
8919 : void
650 noah 8920 GIC 1283 : XLogShutdownWalRcv(void)
8921 : {
8922 1283 : ShutdownWalRcv();
650 noah 8923 ECB :
650 noah 8924 GIC 1283 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8925 1283 : XLogCtl->InstallXLogFileSegmentActive = false;
8926 1283 : LWLockRelease(ControlFileLock);
650 noah 8927 CBC 1283 : }
650 noah 8928 ECB :
417 heikki.linnakangas 8929 : /* Enable WAL file recycling and preallocation. */
2769 fujii 8930 : void
417 heikki.linnakangas 8931 GIC 1572 : SetInstallXLogFileSegmentActive(void)
8932 : {
8933 1572 : LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8934 1572 : XLogCtl->InstallXLogFileSegmentActive = true;
8935 1572 : LWLockRelease(ControlFileLock);
2769 fujii 8936 1572 : }
2769 fujii 8937 ECB :
8938 : bool
417 heikki.linnakangas 8939 CBC 24 : IsInstallXLogFileSegmentActive(void)
8940 : {
8941 : bool result;
3722 simon 8942 ECB :
417 heikki.linnakangas 8943 CBC 24 : LWLockAcquire(ControlFileLock, LW_SHARED);
8944 24 : result = XLogCtl->InstallXLogFileSegmentActive;
417 heikki.linnakangas 8945 GIC 24 : LWLockRelease(ControlFileLock);
4436 rhaas 8946 ECB :
417 heikki.linnakangas 8947 GIC 24 : return result;
8948 : }
8949 :
8950 : /*
8951 : * Update the WalWriterSleeping flag.
8952 : */
3988 tgl 8953 ECB : void
3988 tgl 8954 GIC 366 : SetWalWriterSleeping(bool sleeping)
3988 tgl 8955 ECB : {
3121 andres 8956 CBC 366 : SpinLockAcquire(&XLogCtl->info_lck);
8957 366 : XLogCtl->WalWriterSleeping = sleeping;
3121 andres 8958 GIC 366 : SpinLockRelease(&XLogCtl->info_lck);
3988 tgl 8959 CBC 366 : }
|