/*-------------------------------------------------------------------------
 *
 * checkpointer.c
 *
 * The checkpointer is new as of Postgres 9.2.  It handles all checkpoints.
 * Checkpoints are automatically dispatched after a certain amount of time
 * has elapsed since the last one, and the checkpointer can also be signaled
 * to perform requested checkpoints.  (The GUC parameter that mandates a
 * checkpoint every so many WAL segments is implemented by having backends
 * signal when they fill WAL segments; the checkpointer itself doesn't watch
 * for the condition.)
 *
 * Normal termination is by SIGUSR2, which instructs the checkpointer to
 * execute a shutdown checkpoint and then exit(0).  (All backends must be
 * stopped before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT;
 * like any backend, the checkpointer will simply abort and exit on SIGQUIT.
 *
 * If the checkpointer exits unexpectedly, the postmaster treats that the same
 * as a backend crash: shared memory may be corrupted, so remaining backends
 * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
 * shared memory isn't corrupted, we have lost information about which
 * files need to be fsync'd for the next checkpoint, and so a system
 * restart needs to be forced.)
 *
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/postmaster/checkpointer.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <sys/time.h>
#include <time.h>

#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogrecovery.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/interrupt.h"
#include "replication/syncrep.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/resowner.h"


/*----------
 * Shared memory area for communication between checkpointer and backends
 *
 * The ckpt counters allow backends to watch for completion of a checkpoint
 * request they send.  Here's how it works:
 *	* At start of a checkpoint, checkpointer reads (and clears) the request
 *	  flags and increments ckpt_started, while holding ckpt_lck.
 *	* On completion of a checkpoint, checkpointer sets ckpt_done to
 *	  equal ckpt_started.
 *	* On failure of a checkpoint, checkpointer increments ckpt_failed
 *	  and sets ckpt_done to equal ckpt_started.
 *
 * The algorithm for backends is:
 *	1. Record current values of ckpt_failed and ckpt_started, and
 *	   set request flags, while holding ckpt_lck.
 *	2. Send signal to request checkpoint.
 *	3. Sleep until ckpt_started changes.  Now you know a checkpoint has
 *	   begun since you started this algorithm (although *not* that it was
 *	   specifically initiated by your signal), and that it is using your flags.
 *	4. Record new value of ckpt_started.
 *	5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
 *	   arithmetic here in case counters wrap around.)  Now you know a
 *	   checkpoint has started and completed, but not whether it was
 *	   successful.
 *	6. If ckpt_failed is different from the originally saved value,
 *	   assume request failed; otherwise it was definitely successful.
 *	   (An illustrative sketch of steps 1-6 appears just after this
 *	   comment block.)
 *
 * ckpt_flags holds the OR of the checkpoint request flags sent by all
 * requesting backends since the last checkpoint start.  The flags are
 * chosen so that OR'ing is the correct way to combine multiple requests.
 *
 * num_backend_writes is used to count the number of buffer writes performed
 * by user backend processes.  This counter should be wide enough that it
 * can't overflow during a single processing cycle.  num_backend_fsync
 * counts the subset of those writes that also had to do their own fsync,
 * because the checkpointer failed to absorb their request.
 *
 * The requests array holds fsync requests sent by backends and not yet
 * absorbed by the checkpointer.
 *
 * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and
 * the requests fields are protected by CheckpointerCommLock.
 *----------
 */
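
/*
 * Illustrative sketch (editor's addition, not upstream code): the
 * backend-side protocol described above, reduced to its essentials.
 * RequestCheckpoint() below is the real implementation; it additionally
 * retries the signal and sleeps on the start_cv/done_cv condition variables
 * instead of polling.  Note the wraparound-safe comparison
 * "new_done - new_started >= 0" from step 5.
 */
#if 0
static void
ExampleWaitForCheckpoint(int flags)
{
    int         old_failed,
                old_started;
    int         new_started,
                new_done,
                new_failed;

    /* Step 1: snapshot counters and set request flags under ckpt_lck. */
    SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
    old_failed = CheckpointerShmem->ckpt_failed;
    old_started = CheckpointerShmem->ckpt_started;
    CheckpointerShmem->ckpt_flags |= flags;
    SpinLockRelease(&CheckpointerShmem->ckpt_lck);

    /* Step 2: signal the checkpointer (elided; see RequestCheckpoint). */

    /* Steps 3 and 4: wait until some checkpoint carrying our flags starts. */
    do
    {
        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
        new_started = CheckpointerShmem->ckpt_started;
        SpinLockRelease(&CheckpointerShmem->ckpt_lck);
    } while (new_started == old_started);

    /* Step 5: wait for completion; the subtraction tolerates wraparound. */
    do
    {
        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
        new_done = CheckpointerShmem->ckpt_done;
        new_failed = CheckpointerShmem->ckpt_failed;
        SpinLockRelease(&CheckpointerShmem->ckpt_lck);
    } while (new_done - new_started < 0);

    /* Step 6: a changed failure counter means the checkpoint failed. */
    if (new_failed != old_failed)
        elog(ERROR, "checkpoint request failed");
}
#endif
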
typedef struct
{
    SyncRequestType type;       /* request type */
    FileTag     ftag;           /* file identifier */
} CheckpointerRequest;

typedef struct
{
    pid_t       checkpointer_pid;   /* PID (0 if not started) */

    slock_t     ckpt_lck;       /* protects all the ckpt_* fields */

    int         ckpt_started;   /* advances when checkpoint starts */
    int         ckpt_done;      /* advances when checkpoint done */
    int         ckpt_failed;    /* advances when checkpoint fails */

    int         ckpt_flags;     /* checkpoint flags, as defined in xlog.h */

    ConditionVariable start_cv; /* signaled when ckpt_started advances */
    ConditionVariable done_cv;  /* signaled when ckpt_done advances */

    uint32      num_backend_writes; /* counts user backend buffer writes */
    uint32      num_backend_fsync;  /* counts user backend fsync calls */

    int         num_requests;   /* current # of requests */
    int         max_requests;   /* allocated array size */
    CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
} CheckpointerShmemStruct;

static CheckpointerShmemStruct *CheckpointerShmem;

/* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
#define WRITES_PER_ABSORB 1000

/*
 * GUC parameters
 */
int         CheckPointTimeout = 300;
int         CheckPointWarning = 30;
double      CheckPointCompletionTarget = 0.9;

/*
 * Private state
 */
static bool ckpt_active = false;

/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
static XLogRecPtr ckpt_start_recptr;
static double ckpt_cached_elapsed;

static pg_time_t last_checkpoint_time;
static pg_time_t last_xlog_switch_time;

/* Prototypes for private functions */

static void HandleCheckpointerInterrupts(void);
static void CheckArchiveTimeout(void);
static bool IsCheckpointOnSchedule(double progress);
static bool ImmediateCheckpointRequested(void);
static bool CompactCheckpointerRequestQueue(void);
static void UpdateSharedMemoryConfig(void);

/* Signal handlers */
static void ReqCheckpointHandler(SIGNAL_ARGS);


/*
 * Main entry point for checkpointer process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void
CheckpointerMain(void)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext checkpointer_context;

    CheckpointerShmem->checkpointer_pid = MyProcPid;

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * Note: we deliberately ignore SIGTERM, because during a standard Unix
     * system shutdown cycle, init will SIGTERM all processes at once.  We
     * want to wait for the backends to exit, whereupon the postmaster will
     * tell us it's okay to shut down (via SIGUSR2).
     */
    pqsignal(SIGHUP, SignalHandlerForConfigReload);
    pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
    pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
    /* SIGQUIT handler was already set up by InitPostmasterChild */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);

    /*
     * Initialize so that first time-driven event happens at the correct time.
     */
    last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

    /*
     * Write out stats after shutdown. This needs to be called by exactly one
     * process during a normal shutdown, and since checkpointer is shut down
     * very late...
     *
     * Walsenders are shut down after the checkpointer, but currently don't
     * report stats. If that changes, we need a more complicated solution.
     */
    before_shmem_exit(pgstat_before_server_shutdown, 0);

    /*
     * Create a memory context that we will do all our work in.  We do this so
     * that we can reset the context during error recovery and thereby avoid
     * possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    checkpointer_context = AllocSetContextCreate(TopMemoryContext,
                                                 "Checkpointer",
                                                 ALLOCSET_DEFAULT_SIZES);
    MemoryContextSwitchTo(checkpointer_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * You might wonder why this isn't coded as an infinite loop around a
     * PG_TRY construct.  The reason is that this is the bottom of the
     * exception stack, and so with PG_TRY there would be no exception handler
     * in force at all during the CATCH part.  By leaving the outermost setjmp
     * always active, we have at least some chance of recovering from an error
     * during error recovery.  (If we get into an infinite loop thereby, it
     * will soon be stopped by overflow of elog.c's internal state stack.)
     *
     * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask
     * (to wit, BlockSig) will be restored when longjmp'ing to here.  Thus,
     * signals other than SIGQUIT will be blocked until we complete error
     * recovery.  It might seem that this policy makes the HOLD_INTERRUPTS()
     * call redundant, but it is not since InterruptPending might be set
     * already.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().  We don't have very many resources to worry
         * about in checkpointer, but we do have LWLocks, buffers, and temp
         * files.
         */
        LWLockReleaseAll();
        ConditionVariableCancelSleep();
        pgstat_report_wait_end();
        UnlockBuffers();
        ReleaseAuxProcessResources(false);
        AtEOXact_Buffers(false);
        AtEOXact_SMgr();
        AtEOXact_Files(false);
        AtEOXact_HashTables(false);

        /* Warn any waiting backends that the checkpoint failed. */
        if (ckpt_active)
        {
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            CheckpointerShmem->ckpt_failed++;
            CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

            ckpt_active = false;
        }

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(checkpointer_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(checkpointer_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);

        /*
         * Close all open files after any error.  This is helpful on Windows,
         * where holding deleted files open causes various strange errors.
         * It's not clear we need it elsewhere, but shouldn't hurt.
         */
        smgrcloseall();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);

    /*
     * Ensure all shared memory values are set correctly for the config. Doing
     * this here ensures no race conditions from other concurrent updaters.
     */
    UpdateSharedMemoryConfig();

    /*
     * Advertise our latch that backends can use to wake us up while we're
     * sleeping.
     */
    ProcGlobal->checkpointerLatch = &MyProc->procLatch;

    /*
     * Loop forever
     */
    for (;;)
    {
        bool        do_checkpoint = false;
        int         flags = 0;
        pg_time_t   now;
        int         elapsed_secs;
        int         cur_timeout;

        /* Clear any already-pending wakeups */
        ResetLatch(MyLatch);

        /*
         * Process any requests or signals received recently.
         */
        AbsorbSyncRequests();
        HandleCheckpointerInterrupts();

        /*
         * Detect a pending checkpoint request by checking whether the flags
         * word in shared memory is nonzero.  We shouldn't need to acquire the
         * ckpt_lck for this.
         */
        if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
        {
            do_checkpoint = true;
            PendingCheckpointerStats.requested_checkpoints++;
        }

        /*
         * Force a checkpoint if too much time has elapsed since the last one.
         * Note that we count a timed checkpoint in stats only when this
         * occurs without an external request, but we set the CAUSE_TIME flag
         * bit even if there is also an external request.
         */
        now = (pg_time_t) time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
        {
            if (!do_checkpoint)
                PendingCheckpointerStats.timed_checkpoints++;
            do_checkpoint = true;
            flags |= CHECKPOINT_CAUSE_TIME;
        }

        /*
         * Do a checkpoint if requested.
         */
        if (do_checkpoint)
        {
            bool        ckpt_performed = false;
            bool        do_restartpoint;

            /* Check if we should perform a checkpoint or a restartpoint. */
            do_restartpoint = RecoveryInProgress();

            /*
             * Atomically fetch the request flags to figure out what kind of a
             * checkpoint we should perform, and increase the started-counter
             * to acknowledge that we've started a new checkpoint.
             */
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            flags |= CheckpointerShmem->ckpt_flags;
            CheckpointerShmem->ckpt_flags = 0;
            CheckpointerShmem->ckpt_started++;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ConditionVariableBroadcast(&CheckpointerShmem->start_cv);

            /*
             * The end-of-recovery checkpoint is a real checkpoint that's
             * performed while we're still in recovery.
             */
            if (flags & CHECKPOINT_END_OF_RECOVERY)
                do_restartpoint = false;

            /*
             * We will warn if (a) too soon since last checkpoint (whatever
             * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
             * since the last checkpoint start.  Note in particular that this
             * implementation will not generate warnings caused by
             * CheckPointTimeout < CheckPointWarning.
             */
            if (!do_restartpoint &&
                (flags & CHECKPOINT_CAUSE_XLOG) &&
                elapsed_secs < CheckPointWarning)
                ereport(LOG,
                        (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
                                       "checkpoints are occurring too frequently (%d seconds apart)",
                                       elapsed_secs,
                                       elapsed_secs),
                         errhint("Consider increasing the configuration parameter \"max_wal_size\".")));

            /*
             * Initialize checkpointer-private variables used during
             * checkpoint.
             */
            ckpt_active = true;
            if (do_restartpoint)
                ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
            else
                ckpt_start_recptr = GetInsertRecPtr();
            ckpt_start_time = now;
            ckpt_cached_elapsed = 0;

            /*
             * Do the checkpoint.
             */
            if (!do_restartpoint)
            {
                CreateCheckPoint(flags);
                ckpt_performed = true;
            }
            else
                ckpt_performed = CreateRestartPoint(flags);

            /*
             * After any checkpoint, close all smgr files.  This is so we
             * won't hang onto smgr references to deleted files indefinitely.
             */
            smgrcloseall();

            /*
             * Indicate checkpoint completion to any waiting backends.
             */
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

            if (ckpt_performed)
            {
                /*
                 * Note we record the checkpoint start time not end time as
                 * last_checkpoint_time.  This is so that time-driven
                 * checkpoints happen at a predictable spacing.
                 */
                last_checkpoint_time = now;
            }
            else
            {
                /*
                 * We were not able to perform the restartpoint (checkpoints
                 * throw an ERROR in case of error).  Most likely because we
                 * have not received any new checkpoint WAL records since the
                 * last restartpoint.  Try again in 15 s.
                 */
                last_checkpoint_time = now - CheckPointTimeout + 15;
            }

            ckpt_active = false;

            /* We may have received an interrupt during the checkpoint. */
            HandleCheckpointerInterrupts();
        }

        /* Check for archive_timeout and switch xlog files if necessary. */
        CheckArchiveTimeout();

        /* Report pending statistics to the cumulative stats system */
        pgstat_report_checkpointer();
        pgstat_report_wal(true);

        /*
         * If any checkpoint flags have been set, redo the loop to handle the
         * checkpoint without sleeping.
         */
        if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
            continue;

        /*
         * Sleep until we are signaled or it's time for another checkpoint or
         * xlog file switch.
         */
        now = (pg_time_t) time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
            continue;           /* no sleep for us ... */
        cur_timeout = CheckPointTimeout - elapsed_secs;
        if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
        {
            elapsed_secs = now - last_xlog_switch_time;
            if (elapsed_secs >= XLogArchiveTimeout)
                continue;       /* no sleep for us ... */
            cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
        }

        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                         cur_timeout * 1000L /* convert to ms */ ,
                         WAIT_EVENT_CHECKPOINTER_MAIN);
    }
}

/*
 * Process any new interrupts.
 */
static void
HandleCheckpointerInterrupts(void)
{
    if (ProcSignalBarrierPending)
        ProcessProcSignalBarrier();

    if (ConfigReloadPending)
    {
        ConfigReloadPending = false;
        ProcessConfigFile(PGC_SIGHUP);

        /*
         * Checkpointer is the last process to shut down, so we ask it to
         * hold the keys for a range of other tasks required, most of which
         * have nothing to do with checkpointing at all.
         *
         * For various reasons, some config values can change dynamically so
         * the primary copy of them is held in shared memory to make sure all
         * backends see the same value.  We make Checkpointer responsible for
         * updating the shared memory copy if the parameter setting changes
         * because of SIGHUP.
         */
        UpdateSharedMemoryConfig();
    }
    if (ShutdownRequestPending)
    {
        /*
         * From here on, elog(ERROR) should end with exit(1), not send control
         * back to the sigsetjmp block above
         */
        ExitOnAnyError = true;

        /*
         * Close down the database.
         *
         * Since ShutdownXLOG() creates a restartpoint or checkpoint, and
         * updates the statistics, increment the checkpoint request and flush
         * out pending statistics.
         */
        PendingCheckpointerStats.requested_checkpoints++;
        ShutdownXLOG(0, 0);
        pgstat_report_checkpointer();
        pgstat_report_wal(true);

        /* Normal exit from the checkpointer is here */
        proc_exit(0);           /* done */
    }

    /* Perform logging of memory contexts of this process */
    if (LogMemoryContextPending)
        ProcessLogMemoryContextInterrupt();
}

/*
 * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
 *
 * This will switch to a new WAL file and force an archive file write if
 * meaningful activity is recorded in the current WAL file.  This includes
 * most writes, including just a single checkpoint record, but excludes WAL
 * records that were inserted with the XLOG_MARK_UNIMPORTANT flag set (like
 * snapshots of running transactions).  Such records, depending on
 * configuration, occur on regular intervals and don't contain important
 * information.  This avoids generating archives with a few unimportant
 * records.
 */
static void
CheckArchiveTimeout(void)
{
    pg_time_t   now;
    pg_time_t   last_time;
    XLogRecPtr  last_switch_lsn;

    if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
        return;

    now = (pg_time_t) time(NULL);

    /* First we do a quick check using possibly-stale local state. */
    if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
        return;

    /*
     * Update local state ... note that last_xlog_switch_time is the last time
     * a switch was performed *or requested*.
     */
    last_time = GetLastSegSwitchData(&last_switch_lsn);

    last_xlog_switch_time = Max(last_xlog_switch_time, last_time);

    /* Now we can do the real checks */
    if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
    {
        /*
         * Switch segment only when "important" WAL has been logged since the
         * last segment switch (last_switch_lsn points to the end of the
         * segment the switch occurred in).
         */
        if (GetLastImportantRecPtr() > last_switch_lsn)
        {
            XLogRecPtr  switchpoint;

            /* mark switch as unimportant, avoids triggering checkpoints */
            switchpoint = RequestXLogSwitch(true);

            /*
             * If the returned pointer points exactly to a segment boundary,
             * assume nothing happened.
             */
            if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0)
                elog(DEBUG1, "write-ahead log switch forced (archive_timeout=%d)",
                     XLogArchiveTimeout);
        }

        /*
         * Update state in any case, so we don't retry constantly when the
         * system is idle.
         */
        last_xlog_switch_time = now;
    }
}

/*
 * Returns true if an immediate checkpoint request is pending.  (Note that
 * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
 * there is one pending behind it.)
 */
static bool
ImmediateCheckpointRequested(void)
{
    volatile CheckpointerShmemStruct *cps = CheckpointerShmem;

    /*
     * We don't need to acquire the ckpt_lck in this case because we're only
     * looking at a single flag bit.
     */
    if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
        return true;
    return false;
}

/*
 * CheckpointWriteDelay -- control rate of checkpoint
 *
 * This function is called after each page write performed by BufferSync().
 * It is responsible for throttling BufferSync()'s write rate to hit
 * checkpoint_completion_target.
 *
 * The checkpoint request flags should be passed in; currently the only one
 * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
 *
 * 'progress' is an estimate of how much of the work has been done, as a
 * fraction between 0.0 meaning none, and 1.0 meaning all done.
 */
void
CheckpointWriteDelay(int flags, double progress)
{
    static int  absorb_counter = WRITES_PER_ABSORB;

    /* Do nothing if checkpoint is being executed by non-checkpointer process */
    if (!AmCheckpointerProcess())
        return;

    /*
     * Perform the usual duties and take a nap, unless we're behind schedule,
     * in which case we just try to catch up as quickly as possible.
     */
    if (!(flags & CHECKPOINT_IMMEDIATE) &&
        !ShutdownRequestPending &&
        !ImmediateCheckpointRequested() &&
        IsCheckpointOnSchedule(progress))
    {
        if (ConfigReloadPending)
        {
            ConfigReloadPending = false;
            ProcessConfigFile(PGC_SIGHUP);
            /* update shmem copies of config variables */
            UpdateSharedMemoryConfig();
        }

        AbsorbSyncRequests();
        absorb_counter = WRITES_PER_ABSORB;

        CheckArchiveTimeout();

        /* Report interim statistics to the cumulative stats system */
        pgstat_report_checkpointer();

        /*
         * This sleep used to be connected to bgwriter_delay, typically 200ms.
         * That resulted in more frequent wakeups if not much work to do.
         * Checkpointer and bgwriter are no longer related so take the Big
         * Sleep.
         */
        WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
                  100,
                  WAIT_EVENT_CHECKPOINT_WRITE_DELAY);
        ResetLatch(MyLatch);
    }
    else if (--absorb_counter <= 0)
    {
        /*
         * Absorb pending fsync requests after each WRITES_PER_ABSORB write
         * operations even when we don't sleep, to prevent overflow of the
         * fsync request queue.
         */
        AbsorbSyncRequests();
        absorb_counter = WRITES_PER_ABSORB;
    }

    /* Check for barrier events. */
    if (ProcSignalBarrierPending)
        ProcessProcSignalBarrier();
}

/*
 * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
 *		(or restartpoint) in time?
 *
 * Compares the current progress against the time/segments elapsed since last
 * checkpoint, and returns true if the progress we've made this far is greater
 * than the elapsed time/segments.
 */
static bool
IsCheckpointOnSchedule(double progress)
{
    XLogRecPtr  recptr;
    struct timeval now;
    double      elapsed_xlogs,
                elapsed_time;

    Assert(ckpt_active);

    /* Scale progress according to checkpoint_completion_target. */
    progress *= CheckPointCompletionTarget;

    /*
     * Check against the cached value first.  Only do the more expensive
     * calculations once we reach the target previously calculated.  Since
     * neither time nor the WAL insert pointer moves backwards, a freshly
     * calculated value can only be greater than or equal to the cached value.
     */
    if (progress < ckpt_cached_elapsed)
        return false;

    /*
     * Check progress against WAL segments written and CheckPointSegments.
     *
     * We compare the current WAL insert location against the location
     * computed before calling CreateCheckPoint.  The code in XLogInsert that
     * actually triggers a checkpoint when CheckPointSegments is exceeded
     * compares against RedoRecPtr, so this is not completely accurate.
     * However, it's good enough for our purposes, we're only calculating an
     * estimate anyway.
     *
     * During recovery, we compare last replayed WAL record's location with
     * the location computed before calling CreateRestartPoint. That maintains
     * the same pacing as we have during checkpoints in normal operation, but
     * we might exceed max_wal_size by a fair amount.  That's because there
     * can be a large gap between a checkpoint's redo-pointer and the
     * checkpoint record itself, and we only start the restartpoint after
     * we've seen the checkpoint record.  (The gap is typically up to
     * CheckPointSegments * checkpoint_completion_target where
     * checkpoint_completion_target is the value that was in effect when the
     * WAL was generated).
     */
    if (RecoveryInProgress())
        recptr = GetXLogReplayRecPtr(NULL);
    else
        recptr = GetInsertRecPtr();
    elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) /
                     wal_segment_size) / CheckPointSegments;

    if (progress < elapsed_xlogs)
    {
        ckpt_cached_elapsed = elapsed_xlogs;
        return false;
    }

    /*
     * Check progress against time elapsed and checkpoint_timeout.
     */
    gettimeofday(&now, NULL);
    elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
                    now.tv_usec / 1000000.0) / CheckPointTimeout;

    if (progress < elapsed_time)
    {
        ckpt_cached_elapsed = elapsed_time;
        return false;
    }

    /* It looks like we're on schedule. */
    return true;
}
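
/*
 * Worked example (editor's note, illustrative numbers): with
 * checkpoint_timeout = 300 s and checkpoint_completion_target = 0.9, a
 * caller reporting progress = 0.5 yields a scaled progress of 0.45, so on
 * the time axis the checkpoint counts as on schedule only while elapsed
 * time is under 0.45 * 300 = 135 s; the whole write phase is thus paced to
 * finish around 270 s, leaving 10% headroom before the next timed
 * checkpoint is due.  The WAL-volume test is analogous, with
 * CheckPointSegments worth of WAL in place of checkpoint_timeout seconds.
 */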


/* --------------------------------
 *		signal handler routines
 * --------------------------------
 */

/* SIGINT: set flag to run a normal checkpoint right away */
static void
ReqCheckpointHandler(SIGNAL_ARGS)
{
    int         save_errno = errno;

    /*
     * The signaling process should have set ckpt_flags nonzero, so all we
     * need do is ensure that our main loop gets kicked out of any wait.
     */
    SetLatch(MyLatch);

    errno = save_errno;
}


/* --------------------------------
 *		communication with backends
 * --------------------------------
 */

/*
 * CheckpointerShmemSize
 *		Compute space needed for checkpointer-related shared memory
 */
Size
CheckpointerShmemSize(void)
{
    Size        size;

    /*
     * Currently, the size of the requests[] array is arbitrarily set equal to
     * NBuffers.  This may prove too large or small ...
     */
    size = offsetof(CheckpointerShmemStruct, requests);
    size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest)));

    return size;
}

/*
 * CheckpointerShmemInit
 *		Allocate and initialize checkpointer-related shared memory
 */
void
CheckpointerShmemInit(void)
{
    Size        size = CheckpointerShmemSize();
    bool        found;

    CheckpointerShmem = (CheckpointerShmemStruct *)
        ShmemInitStruct("Checkpointer Data",
                        size,
                        &found);

    if (!found)
    {
        /*
         * First time through, so initialize.  Note that we zero the whole
         * requests array; this is so that CompactCheckpointerRequestQueue can
         * assume that any pad bytes in the request structs are zeroes.
         */
        MemSet(CheckpointerShmem, 0, size);
        SpinLockInit(&CheckpointerShmem->ckpt_lck);
        CheckpointerShmem->max_requests = NBuffers;
        ConditionVariableInit(&CheckpointerShmem->start_cv);
        ConditionVariableInit(&CheckpointerShmem->done_cv);
    }
}

/*
 * RequestCheckpoint
 *		Called in backend processes to request a checkpoint
 *
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 *		ignoring checkpoint_completion_target parameter.
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 *		CHECKPOINT_END_OF_RECOVERY).
 *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
 *		just signal checkpointer to do it, and return).
 *	CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
 *		(This affects logging, and in particular enables CheckPointWarning.)
 */
void
RequestCheckpoint(int flags)
{
    int         ntries;
    int         old_failed,
                old_started;

    /*
     * If in a standalone backend, just do it ourselves.
     */
    if (!IsPostmasterEnvironment)
    {
        /*
         * There's no point in doing slow checkpoints in a standalone backend,
         * because there are no other backends the checkpoint could disrupt.
         */
        CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);

        /*
         * After any checkpoint, close all smgr files.  This is so we won't
         * hang onto smgr references to deleted files indefinitely.
         */
        smgrcloseall();

        return;
    }

    /*
     * Atomically set the request flags, and take a snapshot of the counters.
     * When we see ckpt_started > old_started, we know the flags we set here
     * have been seen by checkpointer.
     *
     * Note that we OR the flags with any existing flags, to avoid overriding
     * a "stronger" request by another backend.  The flag senses must be
     * chosen to make this work!
     */
    SpinLockAcquire(&CheckpointerShmem->ckpt_lck);

    old_failed = CheckpointerShmem->ckpt_failed;
    old_started = CheckpointerShmem->ckpt_started;
    CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);

    SpinLockRelease(&CheckpointerShmem->ckpt_lck);

    /*
     * Send signal to request checkpoint.  It's possible that the checkpointer
     * hasn't started yet, or is in the process of restarting, so we will
     * retry a few times if needed.  (Actually, more than a few times, since
     * on slow or overloaded buildfarm machines, it's been observed that the
     * checkpointer can take several seconds to start.)  However, if not told
     * to wait for the checkpoint to occur, we consider failure to send the
     * signal to be nonfatal and merely LOG it.  The checkpointer should see
     * the request when it does start, with or without getting a signal.
     */
#define MAX_SIGNAL_TRIES 600	/* max wait 60.0 sec */
    for (ntries = 0;; ntries++)
    {
        if (CheckpointerShmem->checkpointer_pid == 0)
        {
            if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
            {
                elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
                     "could not signal for checkpoint: checkpointer is not running");
                break;
            }
        }
        else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0)
        {
            if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
            {
                elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
                     "could not signal for checkpoint: %m");
                break;
            }
        }
        else
            break;              /* signal sent successfully */

        CHECK_FOR_INTERRUPTS();
        pg_usleep(100000L);     /* wait 0.1 sec, then retry */
    }

    /*
     * If requested, wait for completion.  We detect completion according to
     * the algorithm given above.
     */
    if (flags & CHECKPOINT_WAIT)
    {
        int         new_started,
                    new_failed;

        /* Wait for a new checkpoint to start. */
        ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
        for (;;)
        {
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            new_started = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            if (new_started != old_started)
                break;

            ConditionVariableSleep(&CheckpointerShmem->start_cv,
                                   WAIT_EVENT_CHECKPOINT_START);
        }
        ConditionVariableCancelSleep();

        /*
         * We are waiting for ckpt_done >= new_started, in a modulo sense.
         */
        ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
        for (;;)
        {
            int         new_done;

            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            new_done = CheckpointerShmem->ckpt_done;
            new_failed = CheckpointerShmem->ckpt_failed;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            if (new_done - new_started >= 0)
                break;

            ConditionVariableSleep(&CheckpointerShmem->done_cv,
                                   WAIT_EVENT_CHECKPOINT_DONE);
        }
        ConditionVariableCancelSleep();

        if (new_failed != old_failed)
            ereport(ERROR,
                    (errmsg("checkpoint request failed"),
                     errhint("Consult recent messages in the server log for details.")));
    }
}
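
/*
 * Usage sketch (editor's addition, not upstream code): this is roughly how
 * an immediate, waited-for checkpoint is requested, e.g. on behalf of the
 * CHECKPOINT command; the flag macros are the ones documented above and
 * defined in xlog.h.
 */
#if 0
    RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT | CHECKPOINT_FORCE);
#endif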

/*
 * ForwardSyncRequest
 *		Forward a file-fsync request from a backend to the checkpointer
 *
 * Whenever a backend is compelled to write directly to a relation
 * (which should be seldom, if the background writer is getting its job done),
 * the backend calls this routine to pass over knowledge that the relation
 * is dirty and must be fsync'd before next checkpoint.  We also use this
 * opportunity to count such writes for statistical purposes.
 *
 * To avoid holding the lock for longer than necessary, we normally write
 * to the requests[] queue without checking for duplicates.  The checkpointer
 * will have to eliminate dups internally anyway.  However, if we discover
 * that the queue is full, we make a pass over the entire queue to compact
 * it.  This is somewhat expensive, but the alternative is for the backend
 * to perform its own fsync, which is far more expensive in practice.  It
 * is theoretically possible a backend fsync might still be necessary, if
 * the queue is full and contains no duplicate entries.  In that case, we
 * let the backend know by returning false.
 */
bool
ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
{
    CheckpointerRequest *request;
    bool        too_full;

    if (!IsUnderPostmaster)
        return false;           /* probably shouldn't even get here */

    if (AmCheckpointerProcess())
        elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");

    LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);

    /* Count all backend writes regardless of whether they fit in the queue */
    if (!AmBackgroundWriterProcess())
        CheckpointerShmem->num_backend_writes++;

    /*
     * If the checkpointer isn't running or the request queue is full, the
     * backend will have to perform its own fsync request.  But before
     * forcing that to happen, we can try to compact the request queue.
     */
    if (CheckpointerShmem->checkpointer_pid == 0 ||
        (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
         !CompactCheckpointerRequestQueue()))
    {
        /*
         * Count the subset of writes where backends have to do their own
         * fsync
         */
        if (!AmBackgroundWriterProcess())
            CheckpointerShmem->num_backend_fsync++;
        LWLockRelease(CheckpointerCommLock);
        return false;
    }

    /* OK, insert request */
    request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
    request->ftag = *ftag;
    request->type = type;

    /* If queue is more than half full, nudge the checkpointer to empty it */
    too_full = (CheckpointerShmem->num_requests >=
                CheckpointerShmem->max_requests / 2);

    LWLockRelease(CheckpointerCommLock);

    /* ... but not till after we release the lock */
    if (too_full && ProcGlobal->checkpointerLatch)
        SetLatch(ProcGlobal->checkpointerLatch);

    return true;
}
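
/*
 * Caller's-eye sketch (editor's addition, not upstream code): a backend
 * that gets "false" back must fsync the file itself.  In the real tree this
 * fallback lives in the callers of sync.c's RegisterSyncRequest(), e.g.
 * md.c's register_dirty_segment(); "tag" and "seg" below are hypothetical
 * stand-ins for that caller's state.
 */
#if 0
    if (!ForwardSyncRequest(&tag, SYNC_REQUEST))
    {
        /* Queue full and not compactible: do our own fsync. */
        if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
            ereport(data_sync_elevel(ERROR),
                    (errcode_for_file_access(),
                     errmsg("could not fsync file: %m")));
    }
#endif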

/*
 * CompactCheckpointerRequestQueue
 *		Remove duplicates from the request queue to avoid backend fsyncs.
 *		Returns "true" if any entries were removed.
 *
 * Although a full fsync request queue is not common, it can lead to severe
 * performance problems when it does happen.  So far, this situation has
 * only been observed to occur when the system is under heavy write load,
 * and especially during the "sync" phase of a checkpoint.  Without this
 * logic, each backend begins doing an fsync for every block written, which
 * gets very expensive and can slow down the whole system.
 *
 * Trying to do this every time the queue is full could lose if there
 * aren't any removable entries.  But that should be vanishingly rare in
 * practice: there's one queue entry per shared buffer.
 */
static bool
CompactCheckpointerRequestQueue(void)
{
    struct CheckpointerSlotMapping
    {
        CheckpointerRequest request;
        int         slot;
    };

    int         n,
                preserve_count;
    int         num_skipped = 0;
    HASHCTL     ctl;
    HTAB       *htab;
    bool       *skip_slot;

    /* must hold CheckpointerCommLock in exclusive mode */
    Assert(LWLockHeldByMe(CheckpointerCommLock));

    /* Initialize skip_slot array */
    skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);

    /* Initialize temporary hash table */
    ctl.keysize = sizeof(CheckpointerRequest);
    ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
    ctl.hcxt = CurrentMemoryContext;

    htab = hash_create("CompactCheckpointerRequestQueue",
                       CheckpointerShmem->num_requests,
                       &ctl,
                       HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

    /*
     * The basic idea here is that a request can be skipped if it's followed
     * by a later, identical request.  It might seem more sensible to work
     * backwards from the end of the queue and check whether a request is
     * *preceded* by an earlier, identical request, in the hopes of doing less
     * copying.  But that might change the semantics, if there's an
     * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it
     * this way.  It would be possible to be even smarter if we made the code
     * below understand the specific semantics of such requests (it could blow
     * away preceding entries that would end up being canceled anyhow), but
     * it's not clear that the extra complexity would buy us anything.
     */
    for (n = 0; n < CheckpointerShmem->num_requests; n++)
    {
        CheckpointerRequest *request;
        struct CheckpointerSlotMapping *slotmap;
        bool        found;

        /*
         * We use the request struct directly as a hashtable key.  This
         * assumes that any padding bytes in the structs are consistently the
         * same, which should be okay because we zeroed them in
         * CheckpointerShmemInit.  Note also that RelFileLocator had better
         * contain no pad bytes.
         */
        request = &CheckpointerShmem->requests[n];
        slotmap = hash_search(htab, request, HASH_ENTER, &found);
        if (found)
        {
            /* Duplicate, so mark the previous occurrence as skippable */
            skip_slot[slotmap->slot] = true;
            num_skipped++;
        }
        /* Remember slot containing latest occurrence of this request value */
        slotmap->slot = n;
    }

    /* Done with the hash table. */
    hash_destroy(htab);

    /* If no duplicates, we're out of luck. */
    if (!num_skipped)
    {
        pfree(skip_slot);
        return false;
    }

    /* We found some duplicates; remove them. */
    preserve_count = 0;
    for (n = 0; n < CheckpointerShmem->num_requests; n++)
    {
        if (skip_slot[n])
            continue;
        CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n];
    }
    ereport(DEBUG1,
            (errmsg_internal("compacted fsync request queue from %d entries to %d entries",
                             CheckpointerShmem->num_requests, preserve_count)));
    CheckpointerShmem->num_requests = preserve_count;

    /* Cleanup. */
    pfree(skip_slot);
    return true;
}
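
/*
 * Worked example (editor's note): for a queue [A, B, A, C, B], the scan
 * above marks slot 0 (the earlier A) and slot 1 (the earlier B) as
 * skippable, since an identical later request exists for each; the
 * compaction pass then rewrites the queue as [A, C, B], i.e. the latest
 * occurrences in their original relative order.
 */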

/*
 * AbsorbSyncRequests
 *		Retrieve queued sync requests and pass them to sync mechanism.
 *
 * This is exported because it must be called during CreateCheckPoint;
 * we have to be sure we have accepted all pending requests just before
 * we start fsync'ing.  Since CreateCheckPoint sometimes runs in
 * non-checkpointer processes, do nothing if not checkpointer.
 */
void
AbsorbSyncRequests(void)
{
    CheckpointerRequest *requests = NULL;
    CheckpointerRequest *request;
    int         n;

    if (!AmCheckpointerProcess())
        return;

    LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);

    /* Transfer stats counts into pending pgstats message */
    PendingCheckpointerStats.buf_written_backend
        += CheckpointerShmem->num_backend_writes;
    PendingCheckpointerStats.buf_fsync_backend
        += CheckpointerShmem->num_backend_fsync;

    CheckpointerShmem->num_backend_writes = 0;
    CheckpointerShmem->num_backend_fsync = 0;

    /*
     * We try to avoid holding the lock for a long time by copying the request
     * array, and processing the requests after releasing the lock.
     *
     * Once we have cleared the requests from shared memory, we have to PANIC
     * if we then fail to absorb them (eg, because our hashtable runs out of
     * memory).  This is because the system cannot run safely if we are unable
     * to fsync what we have been told to fsync.  Fortunately, the hashtable
     * is so small that the problem is quite unlikely to arise in practice.
     */
    n = CheckpointerShmem->num_requests;
    if (n > 0)
    {
        requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
        memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
    }

    START_CRIT_SECTION();

    CheckpointerShmem->num_requests = 0;

    LWLockRelease(CheckpointerCommLock);

    for (request = requests; n > 0; request++, n--)
        RememberSyncRequest(&request->ftag, request->type);

    END_CRIT_SECTION();

    if (requests)
        pfree(requests);
}

/*
 * Update any shared memory configurations based on config parameters
 */
static void
UpdateSharedMemoryConfig(void)
{
    /* update global shmem state for sync rep */
    SyncRepUpdateSyncStandbysDefined();

    /*
     * If full_page_writes has been changed by SIGHUP, we update it in shared
     * memory and write an XLOG_FPW_CHANGE record.
     */
    UpdateFullPageWrites();

    elog(DEBUG2, "checkpointer updated shared memory configuration values");
}

/*
 * FirstCallSinceLastCheckpoint allows a process to take an action once
 * per checkpoint cycle by asynchronously checking for checkpoint completion.
 */
bool
FirstCallSinceLastCheckpoint(void)
{
    static int  ckpt_done = 0;
    int         new_done;
    bool        FirstCall = false;

    SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
    new_done = CheckpointerShmem->ckpt_done;
    SpinLockRelease(&CheckpointerShmem->ckpt_lck);

    if (new_done != ckpt_done)
        FirstCall = true;

    ckpt_done = new_done;

    return FirstCall;
}