Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * standby.c
4 : * Misc functions used in Hot Standby mode.
5 : *
6 : * All functions for handling RM_STANDBY_ID, which relate to
7 : * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8 : * Plus conflict recovery processing.
9 : *
10 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
11 : * Portions Copyright (c) 1994, Regents of the University of California
12 : *
13 : * IDENTIFICATION
14 : * src/backend/storage/ipc/standby.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 : #include "access/transam.h"
20 : #include "access/twophase.h"
21 : #include "access/xact.h"
22 : #include "access/xloginsert.h"
23 : #include "access/xlogrecovery.h"
24 : #include "access/xlogutils.h"
25 : #include "miscadmin.h"
26 : #include "pgstat.h"
27 : #include "replication/slot.h"
28 : #include "storage/bufmgr.h"
29 : #include "storage/lmgr.h"
30 : #include "storage/proc.h"
31 : #include "storage/procarray.h"
32 : #include "storage/sinvaladt.h"
33 : #include "storage/standby.h"
34 : #include "utils/hsearch.h"
35 : #include "utils/memutils.h"
36 : #include "utils/ps_status.h"
37 : #include "utils/timeout.h"
38 : #include "utils/timestamp.h"
39 :
/*
 * GUC parameters (user-settable).
 *
 * The two max_standby_*_delay values are in milliseconds; a negative value
 * means "wait forever" (see GetStandbyLimitTime()).
 */
int			vacuum_defer_cleanup_age = 0;
int			max_standby_archive_delay = 30 * 1000;	/* WAL not from a stream */
int			max_standby_streaming_delay = 30 * 1000;	/* WAL from a stream */
bool		log_recovery_conflict_waits = false;	/* log long conflict waits? */
45 :
46 : /*
47 : * Keep track of all the exclusive locks owned by original transactions.
48 : * For each known exclusive lock, there is a RecoveryLockEntry in the
49 : * RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a
50 : * given XID are chained together so that we can find them easily.
51 : * For each original transaction that is known to have any such locks,
52 : * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
53 : * which stores the head of the chain of its locks.
54 : */
55 : typedef struct RecoveryLockEntry
56 : {
57 : xl_standby_lock key; /* hash key: xid, dbOid, relOid */
58 : struct RecoveryLockEntry *next; /* chain link */
59 : } RecoveryLockEntry;
60 :
61 : typedef struct RecoveryLockXidEntry
62 : {
63 : TransactionId xid; /* hash key -- must be first */
64 : struct RecoveryLockEntry *head; /* chain head */
65 : } RecoveryLockXidEntry;
66 :
67 : static HTAB *RecoveryLockHash = NULL;
68 : static HTAB *RecoveryLockXidHash = NULL;
69 :
70 : /* Flags set by timeout handlers */
71 : static volatile sig_atomic_t got_standby_deadlock_timeout = false;
72 : static volatile sig_atomic_t got_standby_delay_timeout = false;
73 : static volatile sig_atomic_t got_standby_lock_timeout = false;
74 :
75 : static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
76 : ProcSignalReason reason,
77 : uint32 wait_event_info,
78 : bool report_waiting);
79 : static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
80 : static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
81 : static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
82 : static const char *get_recovery_conflict_desc(ProcSignalReason reason);
83 :
84 : /*
85 : * InitRecoveryTransactionEnvironment
86 : * Initialize tracking of our primary's in-progress transactions.
87 : *
88 : * We need to issue shared invalidations and hold locks. Holding locks
89 : * means others may want to wait on us, so we need to make a lock table
90 : * vxact entry like a real transaction. We could create and delete
91 : * lock table entries for each transaction but its simpler just to create
92 : * one permanent entry and leave it there all the time. Locks are then
93 : * acquired and released as needed. Yes, this means you can see the
94 : * Startup process in pg_locks once we have run this.
95 : */
96 : void
4859 simon 97 GIC 71 : InitRecoveryTransactionEnvironment(void)
98 : {
99 : VirtualTransactionId vxid;
100 : HASHCTL hash_ctl;
101 :
185 tgl 102 GNC 71 : Assert(RecoveryLockHash == NULL); /* don't run this twice */
103 :
104 : /*
105 : * Initialize the hash tables for tracking the locks held by each
106 : * transaction.
107 : */
108 71 : hash_ctl.keysize = sizeof(xl_standby_lock);
109 71 : hash_ctl.entrysize = sizeof(RecoveryLockEntry);
110 71 : RecoveryLockHash = hash_create("RecoveryLockHash",
111 : 64,
112 : &hash_ctl,
113 : HASH_ELEM | HASH_BLOBS);
1748 tmunro 114 GIC 71 : hash_ctl.keysize = sizeof(TransactionId);
185 tgl 115 GNC 71 : hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
116 71 : RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
117 : 64,
118 : &hash_ctl,
119 : HASH_ELEM | HASH_BLOBS);
120 :
121 : /*
122 : * Initialize shared invalidation management for Startup process, being
123 : * careful to register ourselves as a sendOnly process so we don't need to
1036 peter 124 ECB : * read messages, nor will we get signaled when the queue starts filling
125 : * up.
126 : */
4859 simon 127 GIC 71 : SharedInvalBackendInit(true);
128 :
129 : /*
4859 simon 130 ECB : * Lock a virtual transaction id for Startup process.
131 : *
132 : * We need to do GetNextLocalTransactionId() because
133 : * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
134 : * manager doesn't like that at all.
135 : *
136 : * Note that we don't need to run XactLockTableInsert() because nobody
137 : * needs to wait on xids. That sounds a little strange, but table locks
138 : * are held by vxids and row level locks are held by xids. All queries
139 : * hold AccessShareLocks so never block while we write or lock new rows.
140 : */
4859 simon 141 GIC 71 : vxid.backendId = MyBackendId;
142 71 : vxid.localTransactionId = GetNextLocalTransactionId();
143 71 : VirtualXactLockTableInsert(vxid);
144 :
145 71 : standbyState = STANDBY_INITIALIZED;
146 71 : }
147 :
148 : /*
4859 simon 149 ECB : * ShutdownRecoveryTransactionEnvironment
150 : * Shut down transaction tracking
151 : *
152 : * Prepare to switch from hot standby mode to normal operation. Shut down
153 : * recovery-time transaction tracking.
154 : *
155 : * This must be called even in shutdown of startup process if transaction
156 : * tracking has been initialized. Otherwise some locks the tracked
157 : * transactions were holding will not be released and may interfere with
158 : * the processes still running (but will exit soon later) at the exit of
159 : * startup process.
160 : */
161 : void
4859 simon 162 GIC 110 : ShutdownRecoveryTransactionEnvironment(void)
4859 simon 163 ECB : {
733 fujii 164 : /*
165 : * Do nothing if RecoveryLockHash is NULL because that means that
166 : * transaction tracking has not yet been initialized or has already been
167 : * shut down. This makes it safe to have possibly-redundant calls of this
168 : * function during process exit.
169 : */
185 tgl 170 GNC 110 : if (RecoveryLockHash == NULL)
733 fujii 171 GIC 39 : return;
172 :
173 : /* Mark all tracked in-progress transactions as finished. */
4859 simon 174 71 : ExpireAllKnownAssignedTransactionIds();
175 :
176 : /* Release all locks the tracked transactions were holding */
177 71 : StandbyReleaseAllLocks();
178 :
179 : /* Destroy the lock hash tables. */
185 tgl 180 GNC 71 : hash_destroy(RecoveryLockHash);
181 71 : hash_destroy(RecoveryLockXidHash);
182 71 : RecoveryLockHash = NULL;
183 71 : RecoveryLockXidHash = NULL;
184 :
185 : /* Cleanup our VirtualTransaction */
3783 simon 186 CBC 71 : VirtualXactLockTableCleanup();
187 : }
188 :
189 :
190 : /*
191 : * -----------------------------------------------------
192 : * Standby wait timers and backend cancel logic
193 : * -----------------------------------------------------
4859 simon 194 ECB : */
195 :
196 : /*
197 : * Determine the cutoff time at which we want to start canceling conflicting
4663 tgl 198 : * transactions. Returns zero (a time safely in the past) if we are willing
199 : * to wait forever.
200 : */
201 : static TimestampTz
4663 tgl 202 GIC 29 : GetStandbyLimitTime(void)
203 : {
4660 bruce 204 ECB : TimestampTz rtime;
4663 tgl 205 : bool fromStream;
206 :
207 : /*
208 : * The cutoff time is the last WAL data receipt time plus the appropriate
209 : * delay variable. Delay of -1 means wait forever.
210 : */
4663 tgl 211 GIC 29 : GetXLogReceiptTime(&rtime, &fromStream);
212 29 : if (fromStream)
213 : {
214 29 : if (max_standby_streaming_delay < 0)
4663 tgl 215 UIC 0 : return 0; /* wait forever */
4663 tgl 216 GIC 29 : return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
217 : }
218 : else
219 : {
4663 tgl 220 UIC 0 : if (max_standby_archive_delay < 0)
221 0 : return 0; /* wait forever */
222 0 : return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
223 : }
224 : }
225 :
4859 simon 226 ECB : #define STANDBY_INITIAL_WAIT_US 1000
227 : static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
228 :
229 : /*
230 : * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
231 : * We wait here for a while then return. If we decide we can't wait any
232 : * more then we return true, if we can wait some more return false.
233 : */
234 : static bool
1101 fujii 235 CBC 15 : WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
4859 simon 236 ECB : {
237 : TimestampTz ltime;
4663 tgl 238 :
2264 simon 239 GBC 15 : CHECK_FOR_INTERRUPTS();
2264 simon 240 ECB :
241 : /* Are we past the limit time? */
4663 tgl 242 GIC 15 : ltime = GetStandbyLimitTime();
243 15 : if (ltime && GetCurrentTimestamp() >= ltime)
4859 simon 244 GBC 3 : return true;
4859 simon 245 EUB :
246 : /*
247 : * Sleep a bit (this is essential to avoid busy-waiting).
248 : */
1101 fujii 249 GIC 12 : pgstat_report_wait_start(wait_event_info);
4859 simon 250 12 : pg_usleep(standbyWait_us);
1101 fujii 251 12 : pgstat_report_wait_end();
252 :
253 : /*
254 : * Progressively increase the sleep times, but not to more than 1s, since
255 : * pg_usleep isn't interruptible on some platforms.
256 : */
4859 simon 257 12 : standbyWait_us *= 2;
258 12 : if (standbyWait_us > 1000000)
4859 simon 259 LBC 0 : standbyWait_us = 1000000;
260 :
4859 simon 261 GIC 12 : return false;
262 : }
4859 simon 263 ECB :
264 : /*
265 : * Log the recovery conflict.
821 fujii 266 : *
267 : * wait_start is the timestamp when the caller started to wait.
268 : * now is the timestamp when this function has been called.
269 : * wait_list is the list of virtual transaction ids assigned to
270 : * conflicting processes. still_waiting indicates whether
271 : * the startup process is still waiting for the recovery conflict
272 : * to be resolved or not.
273 : */
274 : void
821 fujii 275 CBC 10 : LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
276 : TimestampTz now, VirtualTransactionId *wait_list,
277 : bool still_waiting)
278 : {
279 : long secs;
280 : int usecs;
821 fujii 281 ECB : long msecs;
282 : StringInfoData buf;
821 fujii 283 GBC 10 : int nprocs = 0;
284 :
816 fujii 285 ECB : /*
286 : * There must be no conflicting processes when the recovery conflict has
287 : * already been resolved.
288 : */
816 fujii 289 GIC 10 : Assert(still_waiting || wait_list == NULL);
290 :
821 291 10 : TimestampDifference(wait_start, now, &secs, &usecs);
292 10 : msecs = secs * 1000 + usecs / 1000;
293 10 : usecs = usecs % 1000;
294 :
295 10 : if (wait_list)
296 : {
297 : VirtualTransactionId *vxids;
298 :
821 fujii 299 ECB : /* Construct a string of list of the conflicting processes */
821 fujii 300 GIC 3 : vxids = wait_list;
301 6 : while (VirtualTransactionIdIsValid(*vxids))
302 : {
303 3 : PGPROC *proc = BackendIdGetProc(vxids->backendId);
304 :
305 : /* proc can be NULL if the target backend is not active */
306 3 : if (proc)
821 fujii 307 ECB : {
821 fujii 308 GIC 3 : if (nprocs == 0)
309 : {
310 3 : initStringInfo(&buf);
311 3 : appendStringInfo(&buf, "%d", proc->pid);
312 : }
821 fujii 313 ECB : else
821 fujii 314 UIC 0 : appendStringInfo(&buf, ", %d", proc->pid);
821 fujii 315 ECB :
821 fujii 316 CBC 3 : nprocs++;
821 fujii 317 ECB : }
318 :
821 fujii 319 CBC 3 : vxids++;
320 : }
321 : }
322 :
323 : /*
821 fujii 324 ECB : * If wait_list is specified, report the list of PIDs of active
325 : * conflicting backends in a detail message. Note that if all the backends
326 : * in the list are not active, no detail message is logged.
327 : */
816 fujii 328 GIC 10 : if (still_waiting)
329 : {
816 fujii 330 CBC 5 : ereport(LOG,
331 : errmsg("recovery still waiting after %ld.%03d ms: %s",
650 peter 332 ECB : msecs, usecs, get_recovery_conflict_desc(reason)),
333 : nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
816 fujii 334 : "Conflicting processes: %s.",
335 : nprocs, buf.data) : 0);
336 : }
337 : else
816 fujii 338 EUB : {
816 fujii 339 GIC 5 : ereport(LOG,
816 fujii 340 ECB : errmsg("recovery finished waiting after %ld.%03d ms: %s",
341 : msecs, usecs, get_recovery_conflict_desc(reason)));
342 : }
821 343 :
821 fujii 344 GIC 10 : if (nprocs > 0)
345 3 : pfree(buf.data);
346 10 : }
347 :
348 : /*
349 : * This is the main executioner for any query backend that conflicts with
350 : * recovery processing. Judgement has already been passed on it within
351 : * a specific rmgr. Here we just issue the orders to the procs. The procs
4859 simon 352 ECB : * then throw the required error as instructed.
353 : *
821 fujii 354 : * If report_waiting is true, "waiting" is reported in PS display and the
355 : * wait for recovery conflict is reported in the log, if necessary. If
356 : * the caller is responsible for reporting them, report_waiting should be
357 : * false. Otherwise, both the caller and this function report the same
358 : * thing unexpectedly.
359 : */
360 : static void
4859 simon 361 GIC 8626 : ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
362 : ProcSignalReason reason, uint32 wait_event_info,
1101 fujii 363 ECB : bool report_waiting)
364 : {
1125 fujii 365 GIC 8626 : TimestampTz waitStart = 0;
48 drowley 366 GNC 8626 : bool waiting = false;
821 fujii 367 GIC 8626 : bool logged_recovery_conflict = false;
4859 simon 368 ECB :
4496 rhaas 369 : /* Fast exit, to avoid a kernel call if there's no work to be done. */
4496 rhaas 370 CBC 8626 : if (!VirtualTransactionIdIsValid(*waitlist))
4496 rhaas 371 GIC 8623 : return;
372 :
373 : /* Set the wait start timestamp for reporting */
821 fujii 374 3 : if (report_waiting && (log_recovery_conflict_waits || update_process_title))
1125 375 2 : waitStart = GetCurrentTimestamp();
376 :
4496 rhaas 377 6 : while (VirtualTransactionIdIsValid(*waitlist))
378 : {
379 : /* reset standbyWait_us for each xact we wait for */
4859 simon 380 3 : standbyWait_us = STANDBY_INITIAL_WAIT_US;
381 :
382 : /* wait until the virtual xid is gone */
4266 rhaas 383 18 : while (!VirtualXactLock(*waitlist, false))
384 : {
4859 simon 385 ECB : /* Is it time to kill it? */
1101 fujii 386 GIC 15 : if (WaitExceedsMaxStandbyDelay(wait_event_info))
387 : {
388 : pid_t pid;
4859 simon 389 ECB :
390 : /*
391 : * Now find out who to throw out of the balloon.
392 : */
4859 simon 393 GIC 3 : Assert(VirtualTransactionIdIsValid(*waitlist));
4831 simon 394 CBC 3 : pid = CancelVirtualTransaction(*waitlist, reason);
4859 simon 395 ECB :
396 : /*
397 : * Wait a little bit for it to die so that we avoid flooding
4663 tgl 398 : * an unresponsive backend when system is heavily loaded.
4831 simon 399 : */
4859 simon 400 GIC 3 : if (pid != 0)
4725 tgl 401 CBC 3 : pg_usleep(5000L);
402 : }
403 :
48 drowley 404 GNC 15 : if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
405 : {
821 fujii 406 GIC 14 : TimestampTz now = 0;
821 fujii 407 ECB : bool maybe_log_conflict;
408 : bool maybe_update_title;
409 :
821 fujii 410 CBC 14 : maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
48 drowley 411 GNC 14 : maybe_update_title = (update_process_title && !waiting);
412 :
413 : /* Get the current timestamp if not report yet */
821 fujii 414 GIC 14 : if (maybe_log_conflict || maybe_update_title)
415 14 : now = GetCurrentTimestamp();
416 :
821 fujii 417 ECB : /*
418 : * Report via ps if we have been waiting for more than 500
419 : * msec (should that be configurable?)
420 : */
821 fujii 421 GIC 28 : if (maybe_update_title &&
422 14 : TimestampDifferenceExceeds(waitStart, now, 500))
423 : {
48 drowley 424 UNC 0 : set_ps_display_suffix("waiting");
425 0 : waiting = true;
426 : }
821 fujii 427 ECB :
428 : /*
429 : * Emit the log message if the startup process is waiting
430 : * longer than deadlock_timeout for recovery conflict.
431 : */
821 fujii 432 CBC 22 : if (maybe_log_conflict &&
821 fujii 433 GIC 8 : TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
434 : {
816 435 2 : LogRecoveryConflict(reason, waitStart, now, waitlist, true);
821 436 2 : logged_recovery_conflict = true;
437 : }
821 fujii 438 ECB : }
4859 simon 439 : }
440 :
4859 simon 441 EUB : /* The virtual transaction is gone now, wait for the next one */
4859 simon 442 GBC 3 : waitlist++;
443 : }
444 :
445 : /*
446 : * Emit the log message if recovery conflict was resolved but the startup
447 : * process waited longer than deadlock_timeout for it.
448 : */
816 fujii 449 CBC 3 : if (logged_recovery_conflict)
450 2 : LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
451 : NULL, false);
816 fujii 452 ECB :
453 : /* reset ps display to remove the suffix if we added one */
48 drowley 454 GNC 3 : if (waiting)
48 drowley 455 UNC 0 : set_ps_display_remove_suffix();
456 :
4859 simon 457 ECB : }
458 :
459 : /*
460 : * Generate whatever recovery conflicts are needed to eliminate snapshots that
461 : * might see XIDs <= snapshotConflictHorizon as still running.
462 : *
463 : * snapshotConflictHorizon cutoffs are our standard approach to generating
464 : * granular recovery conflicts. Note that InvalidTransactionId values are
465 : * interpreted as "definitely don't need any conflicts" here, which is a
466 : * general convention that WAL records can (and often do) depend on.
467 : */
468 : void
143 pg 469 GNC 10670 : ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
470 : bool isCatalogRel,
471 : RelFileLocator locator)
472 : {
473 : VirtualTransactionId *backends;
474 :
4736 simon 475 ECB : /*
830 pg 476 : * If we get passed InvalidTransactionId then we do nothing (no conflict).
477 : *
478 : * This can happen when replaying already-applied WAL records after a
479 : * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
480 : * record that marks as frozen a page which was already all-visible. It's
830 pg 481 EUB : * also quite common with records generated during index deletion
482 : * (original execution of the deletion can reason that a recovery conflict
483 : * which is sufficient for the deletion operation must take place before
484 : * replay of the deletion record itself).
485 : */
143 pg 486 GNC 10670 : if (!TransactionIdIsValid(snapshotConflictHorizon))
4528 simon 487 GIC 2046 : return;
488 :
97 pg 489 GNC 8624 : Assert(TransactionIdIsNormal(snapshotConflictHorizon));
143 490 8624 : backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
491 : locator.dbOid);
4833 simon 492 GIC 8624 : ResolveRecoveryConflictWithVirtualXIDs(backends,
493 : PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
494 : WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
1125 fujii 495 ECB : true);
496 :
497 : /*
498 : * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
499 : * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
500 : * seems OK, given that this kind of conflict should not normally be
501 : * reached, e.g. due to using a physical replication slot.
502 : */
2 andres 503 GNC 8624 : if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel)
504 19 : InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
505 : snapshotConflictHorizon);
506 : }
507 :
508 : /*
509 : * Variant of ResolveRecoveryConflictWithSnapshot that works with
510 : * FullTransactionId values
511 : */
512 : void
143 pg 513 UNC 0 : ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
514 : bool isCatalogRel,
515 : RelFileLocator locator)
516 : {
517 : /*
518 : * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
519 : * so truncate the logged FullTransactionId. If the logged value is very
520 : * old, so that XID wrap-around already happened on it, there can't be any
521 : * snapshots that still see it.
522 : */
774 pg 523 LBC 0 : FullTransactionId nextXid = ReadNextFullTransactionId();
697 tgl 524 ECB : uint64 diff;
525 :
774 pg 526 LBC 0 : diff = U64FromFullTransactionId(nextXid) -
143 pg 527 UNC 0 : U64FromFullTransactionId(snapshotConflictHorizon);
774 pg 528 UIC 0 : if (diff < MaxTransactionId / 2)
774 pg 529 ECB : {
530 : TransactionId truncated;
531 :
143 pg 532 UNC 0 : truncated = XidFromFullTransactionId(snapshotConflictHorizon);
2 andres 533 0 : ResolveRecoveryConflictWithSnapshot(truncated,
534 : isCatalogRel,
535 : locator);
536 : }
774 pg 537 UIC 0 : }
538 :
539 : void
4833 simon 540 GIC 1 : ResolveRecoveryConflictWithTablespace(Oid tsid)
541 : {
4833 simon 542 ECB : VirtualTransactionId *temp_file_users;
543 :
544 : /*
545 : * Standby users may be currently using this tablespace for their
546 : * temporary files. We only care about current users because
547 : * temp_tablespace parameter will just ignore tablespaces that no longer
548 : * exist.
549 : *
550 : * Ask everybody to cancel their queries immediately so we can ensure no
551 : * temp files remain and we can remove the tablespace. Nuke the entire
4790 bruce 552 EUB : * site from orbit, it's the only way to be sure.
553 : *
554 : * XXX: We could work out the pids of active backends using this
555 : * tablespace by examining the temp filenames in the directory. We would
556 : * then convert the pids into VirtualXIDs before attempting to cancel
557 : * them.
558 : *
559 : * We don't wait for commit because drop tablespace is non-transactional.
560 : */
4833 simon 561 GIC 1 : temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
4824 simon 562 EUB : InvalidOid);
4833 simon 563 GIC 1 : ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
564 : PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
1101 fujii 565 EUB : WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
1125 566 : true);
4833 simon 567 GBC 1 : }
568 :
569 : void
4833 simon 570 GIC 9 : ResolveRecoveryConflictWithDatabase(Oid dbid)
4833 simon 571 EUB : {
572 : /*
573 : * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
574 : * only waits for transactions and completely idle sessions would block
575 : * us. This is rare enough that we do this as simply as possible: no wait,
4790 bruce 576 : * just force them off immediately.
577 : *
578 : * No locking is required here because we already acquired
4790 bruce 579 ECB : * AccessExclusiveLock. Anybody trying to connect while we do this will
580 : * block during InitPostgres() and then disconnect when they see the
581 : * database has been removed.
582 : */
4833 simon 583 GIC 11 : while (CountDBBackends(dbid) > 0)
584 : {
4805 585 2 : CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
586 :
587 : /*
588 : * Wait awhile for them to die so that we avoid flooding an
589 : * unresponsive backend when system is heavily loaded.
590 : */
4833 591 2 : pg_usleep(10000);
592 : }
593 9 : }
594 :
595 : /*
596 : * ResolveRecoveryConflictWithLock is called from ProcSleep()
597 : * to resolve conflicts with other backends holding relation locks.
598 : *
599 : * The WaitLatch sleep normally done in ProcSleep()
2586 simon 600 ECB : * (when not InHotStandby) is performed here, for code clarity.
601 : *
602 : * We either resolve conflicts immediately or set a timeout to wake us at
603 : * the limit of our patience.
604 : *
605 : * Resolve conflicts by canceling to all backends holding a conflicting
606 : * lock. As we are already queued to be granted the lock, no new lock
607 : * requests conflicting with ours will be granted in the meantime.
608 : *
823 fujii 609 : * We also must check for deadlocks involving the Startup process and
610 : * hot-standby backend processes. If deadlock_timeout is reached in
611 : * this function, all the backends holding the conflicting locks are
612 : * requested to check themselves for deadlocks.
613 : *
614 : * logging_conflict should be true if the recovery conflict has not been
615 : * logged yet even though logging is enabled. After deadlock_timeout is
616 : * reached and the request for deadlock check is sent, we wait again to
617 : * be signaled by the release of the lock if logging_conflict is false.
618 : * Otherwise we return without waiting again so that the caller can report
619 : * the recovery conflict. In this case, then, this function is called again
620 : * with logging_conflict=false (because the recovery conflict has already
621 : * been logged) and we will wait again for the lock to be released.
2586 simon 622 : */
623 : void
821 fujii 624 CBC 3 : ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
625 : {
626 : TimestampTz ltime;
627 : TimestampTz now;
628 :
2586 simon 629 GIC 3 : Assert(InHotStandby);
4833 simon 630 ECB :
2586 simon 631 GIC 3 : ltime = GetStandbyLimitTime();
783 fujii 632 CBC 3 : now = GetCurrentTimestamp();
633 :
634 : /*
635 : * Update waitStart if first time through after the startup process
636 : * started waiting for the lock. It should not be updated every time
637 : * ResolveRecoveryConflictWithLock() is called during the wait.
638 : *
639 : * Use the current time obtained for comparison with ltime as waitStart
640 : * (i.e., the time when this process started waiting for the lock). Since
641 : * getting the current time newly can cause overhead, we reuse the
642 : * already-obtained time to avoid that overhead.
643 : *
644 : * Note that waitStart is updated without holding the lock table's
645 : * partition lock, to avoid the overhead by additional lock acquisition.
646 : * This can cause "waitstart" in pg_locks to become NULL for a very short
647 : * period of time after the wait started even though "granted" is false.
648 : * This is OK in practice because we can assume that users are likely to
649 : * look at "waitstart" when waiting for the lock for a long time.
650 : */
783 fujii 651 GIC 3 : if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
652 1 : pg_atomic_write_u64(&MyProc->waitStart, now);
653 :
654 3 : if (now >= ltime && ltime != 0)
2586 simon 655 1 : {
656 : /*
657 : * We're already behind, so clear a path as quickly as possible.
658 : */
659 : VirtualTransactionId *backends;
660 :
1468 alvherre 661 1 : backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
662 :
1125 fujii 663 ECB : /*
664 : * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
665 : * "waiting" in PS display by disabling its argument report_waiting
666 : * because the caller, WaitOnLock(), has already reported that.
667 : */
4833 simon 668 CBC 1 : ResolveRecoveryConflictWithVirtualXIDs(backends,
669 : PROCSIG_RECOVERY_CONFLICT_LOCK,
1101 fujii 670 1 : PG_WAIT_LOCK | locktag.locktag_type,
1125 fujii 671 ECB : false);
672 : }
673 : else
674 : {
675 : /*
676 : * Wait (or wait again) until ltime, and check for deadlocks as well
677 : * if we will be waiting longer than deadlock_timeout
678 : */
679 : EnableTimeoutParams timeouts[2];
823 fujii 680 GIC 2 : int cnt = 0;
681 :
682 2 : if (ltime != 0)
683 : {
684 2 : got_standby_lock_timeout = false;
685 2 : timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
686 2 : timeouts[cnt].type = TMPARAM_AT;
687 2 : timeouts[cnt].fin_time = ltime;
688 2 : cnt++;
689 : }
4833 simon 690 ECB :
823 fujii 691 CBC 2 : got_standby_deadlock_timeout = false;
823 fujii 692 GIC 2 : timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
823 fujii 693 CBC 2 : timeouts[cnt].type = TMPARAM_AFTER;
694 2 : timeouts[cnt].delay_ms = DeadlockTimeout;
823 fujii 695 GIC 2 : cnt++;
696 :
697 2 : enable_timeouts(timeouts, cnt);
698 : }
699 :
2586 simon 700 ECB : /* Wait to be signaled by the release of the Relation Lock */
2377 rhaas 701 GIC 3 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
702 :
703 : /*
704 : * Exit if ltime is reached. Then all the backends holding conflicting
705 : * locks will be canceled in the next ResolveRecoveryConflictWithLock()
706 : * call.
823 fujii 707 ECB : */
823 fujii 708 GIC 3 : if (got_standby_lock_timeout)
823 fujii 709 LBC 0 : goto cleanup;
710 :
823 fujii 711 GIC 3 : if (got_standby_deadlock_timeout)
712 : {
713 : VirtualTransactionId *backends;
714 :
715 2 : backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
716 :
717 : /* Quick exit if there's no work to be done */
718 2 : if (!VirtualTransactionIdIsValid(*backends))
823 fujii 719 LBC 0 : goto cleanup;
720 :
823 fujii 721 ECB : /*
722 : * Send signals to all the backends holding the conflicting locks, to
723 : * ask them to check themselves for deadlocks.
724 : */
823 fujii 725 CBC 4 : while (VirtualTransactionIdIsValid(*backends))
823 fujii 726 ECB : {
823 fujii 727 CBC 2 : SignalVirtualTransaction(*backends,
728 : PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
729 : false);
730 2 : backends++;
823 fujii 731 ECB : }
732 :
821 733 : /*
734 : * Exit if the recovery conflict has not been logged yet even though
735 : * logging is enabled, so that the caller can log that. Then
736 : * RecoveryConflictWithLock() is called again and we will wait again
737 : * for the lock to be released.
738 : */
821 fujii 739 GIC 2 : if (logging_conflict)
821 fujii 740 CBC 1 : goto cleanup;
741 :
742 : /*
743 : * Wait again here to be signaled by the release of the Relation Lock,
744 : * to prevent the subsequent RecoveryConflictWithLock() from causing
745 : * deadlock_timeout and sending a request for deadlocks check again.
746 : * Otherwise the request continues to be sent every deadlock_timeout
823 fujii 747 ECB : * until the relation locks are released or ltime is reached.
823 fujii 748 EUB : */
823 fujii 749 GIC 1 : got_standby_deadlock_timeout = false;
823 fujii 750 CBC 1 : ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
751 : }
752 :
823 fujii 753 GIC 1 : cleanup:
823 fujii 754 ECB :
755 : /*
756 : * Clear any timeout requests established above. We assume here that the
2586 simon 757 : * Startup process doesn't have any other outstanding timeouts than those
2586 simon 758 EUB : * used by this function. If that stops being true, we could cancel the
759 : * timeouts individually, but that'd be slower.
760 : */
2586 simon 761 GIC 3 : disable_all_timeouts(false);
823 fujii 762 3 : got_standby_lock_timeout = false;
763 3 : got_standby_deadlock_timeout = false;
4833 simon 764 CBC 3 : }
765 :
4824 simon 766 ECB : /*
767 : * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
768 : * to resolve conflicts with other backends holding buffer pins.
769 : *
770 : * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
771 : * (when not InHotStandby) is performed here, for code clarity.
772 : *
773 : * We either resolve conflicts immediately or set a timeout to wake us at
774 : * the limit of our patience.
775 : *
776 : * Resolve conflicts by sending a PROCSIG signal to all backends to check if
777 : * they hold one of the buffer pins that is blocking Startup process. If so,
3919 alvherre 778 : * those backends will take an appropriate error action, ERROR or FATAL.
4824 simon 779 : *
780 : * We also must check for deadlocks. Deadlocks occur because if queries
781 : * wait on a lock, that must be behind an AccessExclusiveLock, which can only
782 : * be cleared if the Startup process replays a transaction completion record.
783 : * If Startup process is also waiting then that is a deadlock. The deadlock
784 : * can occur if the query is waiting and then the Startup sleeps, or if
785 : * Startup is sleeping and the query waits on a lock. We protect against
786 : * only the former sequence here, the latter sequence is checked prior to
787 : * the query sleeping, in CheckRecoveryConflictDeadlock().
4663 tgl 788 : *
789 : * Deadlocks are extremely rare, and relatively expensive to check for,
790 : * so we don't do a deadlock check right away ... only if we have had to wait
791 : * at least deadlock_timeout.
4824 simon 792 : */
793 : void
4824 simon 794 GIC 11 : ResolveRecoveryConflictWithBufferPin(void)
795 : {
796 : TimestampTz ltime;
797 :
798 11 : Assert(InHotStandby);
799 :
4663 tgl 800 CBC 11 : ltime = GetStandbyLimitTime();
4701 simon 801 ECB :
823 fujii 802 CBC 11 : if (GetCurrentTimestamp() >= ltime && ltime != 0)
4663 tgl 803 ECB : {
804 : /*
805 : * We're already behind, so clear a path as quickly as possible.
806 : */
4663 tgl 807 GIC 1 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
808 : }
809 : else
810 : {
811 : /*
812 : * Wake up at ltime, and check for deadlocks as well if we will be
813 : * waiting longer than deadlock_timeout
814 : */
815 : EnableTimeoutParams timeouts[2];
823 fujii 816 10 : int cnt = 0;
817 :
818 10 : if (ltime != 0)
819 : {
820 10 : timeouts[cnt].id = STANDBY_TIMEOUT;
821 10 : timeouts[cnt].type = TMPARAM_AT;
822 10 : timeouts[cnt].fin_time = ltime;
823 10 : cnt++;
824 : }
825 :
826 10 : got_standby_deadlock_timeout = false;
827 10 : timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
828 10 : timeouts[cnt].type = TMPARAM_AFTER;
829 10 : timeouts[cnt].delay_ms = DeadlockTimeout;
830 10 : cnt++;
831 :
832 10 : enable_timeouts(timeouts, cnt);
4824 simon 833 ECB : }
834 :
835 : /*
836 : * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
342 andres 837 : * by one of the timeouts established above.
838 : *
843 fujii 839 : * We assume that only UnpinBuffer() and the timeout requests established
840 : * above can wake us up here. WakeupRecovery() called by walreceiver or
841 : * SIGHUP signal handler, etc cannot do that because it uses the different
842 : * latch from that ProcWaitForSignal() waits on.
843 : */
2377 rhaas 844 GIC 11 : ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
845 :
342 andres 846 CBC 11 : if (got_standby_delay_timeout)
342 andres 847 GIC 1 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
848 10 : else if (got_standby_deadlock_timeout)
849 : {
850 : /*
851 : * Send out a request for hot-standby backends to check themselves for
852 : * deadlocks.
853 : *
854 : * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
823 fujii 855 ECB : * to be signaled by UnpinBuffer() again and send a request for
856 : * deadlocks check if deadlock_timeout happens. This causes the
857 : * request to continue to be sent every deadlock_timeout until the
858 : * buffer is unpinned or ltime is reached. This would increase the
859 : * workload in the startup process and backends. In practice it may
860 : * not be so harmful because the period that the buffer is kept pinned
861 : * is basically no so long. But we should fix this?
862 : */
331 alvherre 863 GIC 6 : SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
864 : }
823 fujii 865 ECB :
3919 alvherre 866 : /*
3602 bruce 867 : * Clear any timeout requests established above. We assume here that the
868 : * Startup process doesn't have any other timeouts than what this function
869 : * uses. If that stops being true, we could cancel the timeouts
870 : * individually, but that'd be slower.
3919 alvherre 871 : */
3919 alvherre 872 GIC 11 : disable_all_timeouts(false);
342 andres 873 11 : got_standby_delay_timeout = false;
823 fujii 874 11 : got_standby_deadlock_timeout = false;
4824 simon 875 11 : }
876 :
877 : static void
4803 878 8 : SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
879 : {
880 8 : Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
881 : reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
882 :
4824 simon 883 ECB : /*
884 : * We send signal to all backends to ask them if they are holding the
4790 bruce 885 : * buffer pin which is delaying the Startup process. We must not set the
886 : * conflict flag yet, since most backends will be innocent. Let the
887 : * SIGUSR1 handling in each backend decide their own fate.
888 : */
4803 simon 889 GIC 8 : CancelDBBackends(InvalidOid, reason, false);
4824 890 8 : }
891 :
892 : /*
893 : * In Hot Standby perform early deadlock detection. We abort the lock
894 : * wait if we are about to sleep while holding the buffer pin that Startup
895 : * process is waiting for.
896 : *
897 : * Note: this code is pessimistic, because there is no way for it to
898 : * determine whether an actual deadlock condition is present: the lock we
899 : * need to wait for might be unrelated to any held by the Startup process.
900 : * Sooner or later, this mechanism should get ripped out in favor of somehow
901 : * accounting for buffer locks in DeadLockCheck(). However, errors here
4268 tgl 902 ECB : * seem to be very low-probability in practice, so for now it's not worth
903 : * the trouble.
904 : */
905 : void
4268 tgl 906 GIC 1 : CheckRecoveryConflictDeadlock(void)
907 : {
908 1 : Assert(!InRecovery); /* do not call in Startup process */
909 :
4816 simon 910 1 : if (!HoldingBufferPinThatDelaysRecovery())
4816 simon 911 CBC 1 : return;
4816 simon 912 ECB :
913 : /*
914 : * Error message should match ProcessInterrupts() but we avoid calling
915 : * that because we aren't handling an interrupt at this point. Note that
916 : * we only cancel the current transaction here, so if we are in a
917 : * subtransaction and the pin is held by a parent, then the Startup
918 : * process will continue to wait even though we have avoided deadlock.
919 : */
4816 simon 920 UIC 0 : ereport(ERROR,
921 : (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
922 : errmsg("canceling statement due to conflict with recovery"),
923 : errdetail("User transaction caused buffer deadlock with recovery.")));
924 : }
925 :
926 :
927 : /* --------------------------------
3919 alvherre 928 ECB : * timeout handler routines
929 : * --------------------------------
930 : */
931 :
932 : /*
933 : * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
934 : * exceeded.
935 : */
936 : void
3919 alvherre 937 GIC 8 : StandbyDeadLockHandler(void)
938 : {
823 fujii 939 8 : got_standby_deadlock_timeout = true;
3919 alvherre 940 8 : }
941 :
942 : /*
943 : * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
944 : */
3919 alvherre 945 ECB : void
3919 alvherre 946 GIC 1 : StandbyTimeoutHandler(void)
3919 alvherre 947 ECB : {
342 andres 948 GIC 1 : got_standby_delay_timeout = true;
3919 alvherre 949 CBC 1 : }
3919 alvherre 950 ECB :
951 : /*
952 : * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
953 : */
954 : void
2586 simon 955 GIC 1 : StandbyLockTimeoutHandler(void)
956 : {
823 fujii 957 1 : got_standby_lock_timeout = true;
2586 simon 958 1 : }
3919 alvherre 959 EUB :
960 : /*
961 : * -----------------------------------------------------
962 : * Locking in Recovery Mode
963 : * -----------------------------------------------------
964 : *
965 : * All locks are held by the Startup process using a single virtual
966 : * transaction. This implementation is both simpler and in some senses,
967 : * more correct. The locks held mean "some original transaction held
968 : * this lock, so query access is not allowed at this time". So the Startup
969 : * process is the proxy by which the original locks are implemented.
970 : *
971 : * We only keep track of AccessExclusiveLocks, which are only ever held by
972 : * one transaction on one relation.
973 : *
974 : * We keep a table of known locks in the RecoveryLockHash hash table.
975 : * The point of that table is to let us efficiently de-duplicate locks,
976 : * which is important because checkpoints will re-report the same locks
977 : * already held. There is also a RecoveryLockXidHash table with one entry
978 : * per xid, which allows us to efficiently find all the locks held by a
979 : * given original transaction.
980 : *
981 : * We use session locks rather than normal locks so we don't need
982 : * ResourceOwners.
983 : */
984 :
4859 simon 985 ECB :
986 : void
4859 simon 987 CBC 19239 : StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
4859 simon 988 ECB : {
989 : RecoveryLockXidEntry *xidentry;
990 : RecoveryLockEntry *lockentry;
991 : xl_standby_lock key;
992 : LOCKTAG locktag;
993 : bool found;
994 :
995 : /* Already processed? */
4094 simon 996 GIC 38478 : if (!TransactionIdIsValid(xid) ||
4094 simon 997 CBC 38470 : TransactionIdDidCommit(xid) ||
998 19231 : TransactionIdDidAbort(xid))
4859 simon 999 GIC 8 : return;
1000 :
1001 19231 : elog(trace_recovery(DEBUG4),
1002 : "adding recovery lock: db %u rel %u", dbOid, relOid);
1003 :
1004 : /* dbOid is InvalidOid when we are locking a shared relation. */
1005 19231 : Assert(OidIsValid(relOid));
1006 :
1007 : /* Create a hash entry for this xid, if we don't have one already. */
185 tgl 1008 GNC 19231 : xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
1748 tmunro 1009 GIC 19231 : if (!found)
1010 : {
185 tgl 1011 GNC 8761 : Assert(xidentry->xid == xid); /* dynahash should have set this */
1012 8761 : xidentry->head = NULL;
1013 : }
1014 :
1015 : /* Create a hash entry for this lock, unless we have one already. */
1016 19231 : key.xid = xid;
1017 19231 : key.dbOid = dbOid;
1018 19231 : key.relOid = relOid;
1019 19231 : lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
1020 19231 : if (!found)
1021 : {
1022 : /* It's new, so link it into the XID's list ... */
1023 19147 : lockentry->next = xidentry->head;
1024 19147 : xidentry->head = lockentry;
1025 :
1026 : /* ... and acquire the lock locally. */
1027 19147 : SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
1028 :
1029 19147 : (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
1030 : }
1031 : }
1032 :
1033 : /*
1034 : * Release all the locks associated with this RecoveryLockXidEntry.
1035 : */
1036 : static void
1037 8761 : StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
1038 : {
1039 : RecoveryLockEntry *entry;
1040 : RecoveryLockEntry *next;
1041 :
1042 27908 : for (entry = xidentry->head; entry != NULL; entry = next)
1043 : {
1044 : LOCKTAG locktag;
1045 :
1748 tmunro 1046 CBC 19147 : elog(trace_recovery(DEBUG4),
1748 tmunro 1047 ECB : "releasing recovery lock: xid %u db %u rel %u",
1048 : entry->key.xid, entry->key.dbOid, entry->key.relOid);
1049 : /* Release the lock ... */
185 tgl 1050 GNC 19147 : SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
1748 tmunro 1051 GIC 19147 : if (!LockRelease(&locktag, AccessExclusiveLock, true))
1748 tmunro 1052 ECB : {
1748 tmunro 1053 UIC 0 : elog(LOG,
1054 : "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
1055 : entry->key.xid, entry->key.dbOid, entry->key.relOid);
1748 tmunro 1056 LBC 0 : Assert(false);
1057 : }
1058 : /* ... and remove the per-lock hash entry */
185 tgl 1059 GNC 19147 : next = entry->next;
1060 19147 : hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
1061 : }
525 tgl 1062 ECB :
185 tgl 1063 GNC 8761 : xidentry->head = NULL; /* just for paranoia */
1748 tmunro 1064 GIC 8761 : }
4790 bruce 1065 ECB :
1066 : /*
1067 : * Release locks for specific XID, or all locks if it's InvalidXid.
1068 : */
1748 tmunro 1069 : static void
1748 tmunro 1070 GIC 9426 : StandbyReleaseLocks(TransactionId xid)
1071 : {
1072 : RecoveryLockXidEntry *entry;
4859 simon 1073 ECB :
1748 tmunro 1074 CBC 9426 : if (TransactionIdIsValid(xid))
1748 tmunro 1075 ECB : {
185 tgl 1076 GNC 9426 : if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
4859 simon 1077 ECB : {
185 tgl 1078 GNC 8761 : StandbyReleaseXidEntryLocks(entry);
1079 8761 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
4859 simon 1080 ECB : }
1081 : }
1082 : else
1748 tmunro 1083 UIC 0 : StandbyReleaseAllLocks();
4859 simon 1084 CBC 9426 : }
1085 :
4859 simon 1086 ECB : /*
1087 : * Release locks for a transaction tree, starting at xid down, from
1088 : * RecoveryLockXidHash.
1089 : *
1090 : * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
1091 : * to remove any AccessExclusiveLocks requested by a transaction.
1092 : */
1093 : void
4859 simon 1094 CBC 8932 : StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
1095 : {
1096 : int i;
1097 :
4859 simon 1098 GIC 8932 : StandbyReleaseLocks(xid);
4859 simon 1099 ECB :
4859 simon 1100 GIC 9426 : for (i = 0; i < nsubxids; i++)
1101 494 : StandbyReleaseLocks(subxids[i]);
1102 8932 : }
4859 simon 1103 ECB :
1104 : /*
1105 : * Called at end of recovery and when we see a shutdown checkpoint.
1106 : */
4094 1107 : void
4094 simon 1108 CBC 71 : StandbyReleaseAllLocks(void)
1109 : {
1744 andrew 1110 EUB : HASH_SEQ_STATUS status;
1111 : RecoveryLockXidEntry *entry;
1112 :
4094 simon 1113 GBC 71 : elog(trace_recovery(DEBUG2), "release all standby locks");
1114 :
185 tgl 1115 GNC 71 : hash_seq_init(&status, RecoveryLockXidHash);
1748 tmunro 1116 CBC 71 : while ((entry = hash_seq_search(&status)))
4094 simon 1117 ECB : {
185 tgl 1118 UNC 0 : StandbyReleaseXidEntryLocks(entry);
1119 0 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
4094 simon 1120 ECB : }
4094 simon 1121 CBC 71 : }
1122 :
1123 : /*
1124 : * StandbyReleaseOldLocks
1125 : * Release standby locks held by top-level XIDs that aren't running,
1126 : * as long as they're not prepared transactions.
4094 simon 1127 ECB : */
1128 : void
1758 simon 1129 GIC 195 : StandbyReleaseOldLocks(TransactionId oldxid)
1130 : {
1744 andrew 1131 ECB : HASH_SEQ_STATUS status;
1132 : RecoveryLockXidEntry *entry;
4859 simon 1133 :
185 tgl 1134 GNC 195 : hash_seq_init(&status, RecoveryLockXidHash);
1748 tmunro 1135 CBC 212 : while ((entry = hash_seq_search(&status)))
4859 simon 1136 ECB : {
1748 tmunro 1137 GIC 17 : Assert(TransactionIdIsValid(entry->xid));
1138 :
1139 : /* Skip if prepared transaction. */
1748 tmunro 1140 GBC 17 : if (StandbyTransactionIdIsPrepared(entry->xid))
1748 tmunro 1141 LBC 0 : continue;
1142 :
1143 : /* Skip if >= oldxid. */
1748 tmunro 1144 GIC 17 : if (!TransactionIdPrecedes(entry->xid, oldxid))
1145 17 : continue;
1146 :
1147 : /* Remove all locks and hash table entry. */
185 tgl 1148 UNC 0 : StandbyReleaseXidEntryLocks(entry);
1149 0 : hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1150 : }
4859 simon 1151 CBC 195 : }
1152 :
1153 : /*
1154 : * --------------------------------------------------------------------
4790 bruce 1155 ECB : * Recovery handling for Rmgr RM_STANDBY_ID
1156 : *
4859 simon 1157 : * These record types will only be created if XLogStandbyInfoActive()
1158 : * --------------------------------------------------------------------
1159 : */
1160 :
1161 : void
3062 heikki.linnakangas 1162 GIC 19992 : standby_redo(XLogReaderState *record)
1163 : {
1164 19992 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
4859 simon 1165 ECB :
1166 : /* Backup blocks are not used in standby records */
3062 heikki.linnakangas 1167 GIC 19992 : Assert(!XLogRecHasAnyBlockRefs(record));
1168 :
1169 : /* Do nothing if we're not in hot standby mode */
4859 simon 1170 CBC 19992 : if (standbyState == STANDBY_DISABLED)
4859 simon 1171 GIC 122 : return;
4859 simon 1172 ECB :
4859 simon 1173 CBC 19870 : if (info == XLOG_STANDBY_LOCK)
1174 : {
4859 simon 1175 GBC 19195 : xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
4790 bruce 1176 EUB : int i;
1177 :
4859 simon 1178 CBC 38434 : for (i = 0; i < xlrec->nlocks; i++)
4859 simon 1179 GIC 19239 : StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
1180 : xlrec->locks[i].dbOid,
1181 : xlrec->locks[i].relOid);
1182 : }
1183 675 : else if (info == XLOG_RUNNING_XACTS)
1184 : {
1185 160 : xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
4859 simon 1186 ECB : RunningTransactionsData running;
1187 :
4859 simon 1188 GIC 160 : running.xcnt = xlrec->xcnt;
3780 1189 160 : running.subxcnt = xlrec->subxcnt;
4859 1190 160 : running.subxid_overflow = xlrec->subxid_overflow;
4859 simon 1191 CBC 160 : running.nextXid = xlrec->nextXid;
4713 1192 160 : running.latestCompletedXid = xlrec->latestCompletedXid;
4859 simon 1193 GIC 160 : running.oldestRunningXid = xlrec->oldestRunningXid;
4859 simon 1194 CBC 160 : running.xids = xlrec->xids;
1195 :
4859 simon 1196 GIC 160 : ProcArrayApplyRecoveryInfo(&running);
4859 simon 1197 ECB : }
2542 andres 1198 GBC 515 : else if (info == XLOG_INVALIDATIONS)
1199 : {
2542 andres 1200 GIC 515 : xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
2542 andres 1201 ECB :
2542 andres 1202 CBC 515 : ProcessCommittedInvalidationMessages(xlrec->msgs,
1203 : xlrec->nmsgs,
2542 andres 1204 GIC 515 : xlrec->relcacheInitFileInval,
2542 andres 1205 EUB : xlrec->dbId,
1206 : xlrec->tsId);
1207 : }
4859 simon 1208 ECB : else
4082 tgl 1209 UIC 0 : elog(PANIC, "standby_redo: unknown op code %u", info);
1210 : }
1211 :
1212 : /*
1213 : * Log details of the current snapshot to WAL. This allows the snapshot state
1214 : * to be reconstructed on the standby and for logical decoding.
1215 : *
1216 : * This is used for Hot Standby as follows:
1217 : *
1218 : * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
4714 simon 1219 ECB : * start from a shutdown checkpoint because we know nothing was running
1220 : * at that time and our recovery snapshot is known empty. In the more
1221 : * typical case of an online checkpoint we need to jump through a few
1222 : * hoops to get a correct recovery snapshot and this requires a two or
1223 : * sometimes a three stage process.
1224 : *
1225 : * The initial snapshot must contain all running xids and all current
1226 : * AccessExclusiveLocks at a point in time on the standby. Assembling
1227 : * that information while the server is running requires many and
1228 : * various LWLocks, so we choose to derive that information piece by
1229 : * piece and then re-assemble that info on the standby. When that
1230 : * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
1231 : *
1232 : * Since locking on the primary when we derive the information is not
1233 : * strict, we note that there is a time window between the derivation and
1234 : * writing to WAL of the derived information. That allows race conditions
1235 : * that we must resolve, since xids and locks may enter or leave the
1236 : * snapshot during that window. This creates the issue that an xid or
1237 : * lock may start *after* the snapshot has been derived yet *before* the
1238 : * snapshot is logged in the running xacts WAL record. We resolve this by
1239 : * starting to accumulate changes at a point just prior to when we derive
1240 : * the snapshot on the primary, then ignore duplicates when we later apply
1241 : * the snapshot from the running xacts record. This is implemented during
447 akapila 1242 : * CreateCheckPoint() where we use the logical checkpoint location as
1243 : * our starting point and then write the running xacts record immediately
1244 : * before writing the main checkpoint WAL record. Since we always start
4714 simon 1245 : * up from a checkpoint and are immediately at our starting point, we
1246 : * unconditionally move to STANDBY_INITIALIZED. After this point we
1247 : * must do 4 things:
971 andres 1248 : * * move shared nextXid forwards as we see new xids
4660 bruce 1249 : * * extend the clog and subtrans with each new xid
1250 : * * keep track of uncommitted known assigned xids
1251 : * * keep track of uncommitted AccessExclusiveLocks
1252 : *
4714 simon 1253 : * When we see a commit/abort we must remove known assigned xids and locks
1254 : * from the completing transaction. Attempted removals that cannot locate
1255 : * an entry are expected and must not cause an error when we are in state
1256 : * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
1257 : * KnownAssignedXidsRemove().
1258 : *
1259 : * Later, when we apply the running xact data we must be careful to ignore
1260 : * transactions already committed, since those commits raced ahead when
1261 : * making WAL entries.
1262 : *
1263 : * The loose timing also means that locks may be recorded that have a
1264 : * zero xid, since xids are removed from procs before locks are removed.
1265 : * So we must prune the lock list down to ensure we hold locks only for
4094 simon 1266 EUB : * currently running xids, performed by StandbyReleaseOldLocks().
1267 : * Zero xids should no longer be possible, but we may be replaying WAL
1268 : * from a time when they were possible.
1269 : *
1270 : * For logical decoding only the running xacts information is needed;
1271 : * there's no need to look at the locking information, but it's logged anyway,
1272 : * as there's no independent knob to just enable logical decoding. For
1273 : * details of how this is used, check snapbuild.c's introductory comment.
1274 : *
1275 : *
1276 : * Returns the RecPtr of the last inserted record.
1277 : */
1278 : XLogRecPtr
3780 tgl 1279 GIC 1749 : LogStandbySnapshot(void)
1280 : {
1281 : XLogRecPtr recptr;
1282 : RunningTransactions running;
1283 : xl_standby_lock *locks;
1284 : int nlocks;
1285 :
4859 simon 1286 1749 : Assert(XLogStandbyInfoActive());
1287 :
1288 : /*
1289 : * Get details of any AccessExclusiveLocks being held at the moment.
1290 : */
1291 1749 : locks = GetRunningTransactionLocks(&nlocks);
1292 1749 : if (nlocks > 0)
1293 27 : LogAccessExclusiveLocks(nlocks, locks);
3596 tgl 1294 1749 : pfree(locks);
1295 :
1296 : /*
1297 : * Log details of all in-progress transactions. This should be the last
1298 : * record we write, because standby will open up when it sees this.
1299 : */
4859 simon 1300 1749 : running = GetRunningTransactionData();
1301 :
1302 : /*
1303 : * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
1304 : * For Hot Standby this can be done before inserting the WAL record
1305 : * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
1306 : * the clog. For logical decoding, though, the lock can't be released
1307 : * early because the clog might be "in the future" from the POV of the
1308 : * historic snapshot. This would allow for situations where we're waiting
1309 : * for the end of a transaction listed in the xl_running_xacts record
1310 : * which, according to the WAL, has committed before the xl_running_xacts
1311 : * record. Fortunately this routine isn't executed frequently, and it's
1312 : * only a shared lock.
1313 : */
3324 rhaas 1314 1749 : if (wal_level < WAL_LEVEL_LOGICAL)
1315 1366 : LWLockRelease(ProcArrayLock);
1316 :
3371 1317 1749 : recptr = LogCurrentRunningXacts(running);
1318 :
1319 : /* Release lock if we kept it longer ... */
3324 1320 1749 : if (wal_level >= WAL_LEVEL_LOGICAL)
1321 383 : LWLockRelease(ProcArrayLock);
1322 :
1323 : /* GetRunningTransactionData() acquired XidGenLock, we must release it */
4506 heikki.linnakangas 1324 1749 : LWLockRelease(XidGenLock);
1325 :
3371 rhaas 1326 1749 : return recptr;
1327 : }
1328 :
1329 : /*
1330 : * Record an enhanced snapshot of running transactions into WAL.
1331 : *
1332 : * The definitions of RunningTransactionsData and xl_running_xacts are
1333 : * similar. We keep them separate because xl_running_xacts is a contiguous
1334 : * chunk of memory and never exists fully until it is assembled in WAL.
1335 : * The inserted records are marked as not being important for durability,
2253 heikki.linnakangas 1336 ECB : * to avoid triggering superfluous checkpoint / archiving activity.
1337 : */
1338 : static XLogRecPtr
4859 simon 1339 GIC 1749 : LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
1340 : {
1341 : xl_running_xacts xlrec;
1342 : XLogRecPtr recptr;
4859 simon 1343 ECB :
4859 simon 1344 GIC 1749 : xlrec.xcnt = CurrRunningXacts->xcnt;
3780 1345 1749 : xlrec.subxcnt = CurrRunningXacts->subxcnt;
4859 1346 1749 : xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
1347 1749 : xlrec.nextXid = CurrRunningXacts->nextXid;
4859 simon 1348 CBC 1749 : xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
4713 1349 1749 : xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
4859 simon 1350 ECB :
1351 : /* Header */
3062 heikki.linnakangas 1352 GIC 1749 : XLogBeginInsert();
2299 andres 1353 1749 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
3062 heikki.linnakangas 1354 1749 : XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
1355 :
1356 : /* array of TransactionIds */
4859 simon 1357 CBC 1749 : if (xlrec.xcnt > 0)
3062 heikki.linnakangas 1358 GIC 1281 : XLogRegisterData((char *) CurrRunningXacts->xids,
2118 tgl 1359 1281 : (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
1360 :
3062 heikki.linnakangas 1361 1749 : recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
1362 :
4859 simon 1363 1749 : if (CurrRunningXacts->subxid_overflow)
4751 1364 1 : elog(trace_recovery(DEBUG2),
1365 : "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
1366 : CurrRunningXacts->xcnt,
1367 : LSN_FORMAT_ARGS(recptr),
1368 : CurrRunningXacts->oldestRunningXid,
1369 : CurrRunningXacts->latestCompletedXid,
1370 : CurrRunningXacts->nextXid);
4859 simon 1371 ECB : else
4751 simon 1372 CBC 1748 : elog(trace_recovery(DEBUG2),
1373 : "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
3780 simon 1374 ECB : CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
1375 : LSN_FORMAT_ARGS(recptr),
1376 : CurrRunningXacts->oldestRunningXid,
4660 bruce 1377 : CurrRunningXacts->latestCompletedXid,
1378 : CurrRunningXacts->nextXid);
1379 :
1380 : /*
3371 rhaas 1381 : * Ensure running_xacts information is synced to disk not too far in the
1382 : * future. We don't want to stall anything though (i.e. use XLogFlush()),
3260 bruce 1383 : * so we let the wal writer do it during normal operation.
1384 : * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
1385 : * and nudge the WALWriter into action if sleeping. Check
1386 : * XLogBackgroundFlush() for details why a record might not be flushed
1387 : * without it.
1388 : */
3371 rhaas 1389 GIC 1749 : XLogSetAsyncXactLSN(recptr);
1390 :
1391 1749 : return recptr;
1392 : }
1393 :
1394 : /*
1395 : * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
4859 simon 1396 ECB : * logged, as described in backend/storage/lmgr/README.
1397 : */
1398 : static void
4859 simon 1399 GIC 197545 : LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
1400 : {
4790 bruce 1401 ECB : xl_standby_locks xlrec;
4859 simon 1402 :
4859 simon 1403 CBC 197545 : xlrec.nlocks = nlocks;
4859 simon 1404 ECB :
3062 heikki.linnakangas 1405 CBC 197545 : XLogBeginInsert();
1406 197545 : XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
3062 heikki.linnakangas 1407 GIC 197545 : XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
2299 andres 1408 197545 : XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
4859 simon 1409 ECB :
3062 heikki.linnakangas 1410 CBC 197545 : (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
4859 simon 1411 197545 : }
1412 :
1413 : /*
4859 simon 1414 ECB : * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1415 : */
1416 : void
4859 simon 1417 GIC 197518 : LogAccessExclusiveLock(Oid dbOid, Oid relOid)
4859 simon 1418 ECB : {
1419 : xl_standby_lock xlrec;
1420 :
2209 simon 1421 CBC 197518 : xlrec.xid = GetCurrentTransactionId();
1422 :
4859 simon 1423 GIC 197518 : xlrec.dbOid = dbOid;
1424 197518 : xlrec.relOid = relOid;
1425 :
1426 197518 : LogAccessExclusiveLocks(1, &xlrec);
2209 1427 197518 : MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
4859 1428 197518 : }
4514 simon 1429 ECB :
/*
 * LogAccessExclusiveLockPrepare
 *		Get ready to log an AccessExclusiveLock, for use during
 *		LockAcquire().
 */
void
LogAccessExclusiveLockPrepare(void)
{
	/*
	 * Make sure this transaction has a TransactionId.  There are two
	 * reasons, both to do with lock release on the standby.
	 *
	 * First, with an xid assigned, RecordTransactionCommit() and
	 * RecordTransactionAbort() do not optimise away the transaction
	 * completion record which recovery relies upon to release locks.  It's
	 * a hack, but for a corner case not worth adding code for into the main
	 * commit path.
	 *
	 * Second, the xid must exist before the lock is recorded in shared
	 * memory; otherwise a concurrently executing
	 * GetRunningTransactionLocks() might see a lock associated with an
	 * InvalidTransactionId which we later assert cannot happen.
	 */
	(void) GetCurrentTransactionId();
}
1450 :
1451 : /*
1452 : * Emit WAL for invalidations. This currently is only used for commits without
1453 : * an xid but which contain invalidations.
1454 : */
1455 : void
2542 andres 1456 CBC 43463 : LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
1457 : bool relcacheInitFileInval)
1458 : {
1459 : xl_invalidations xlrec;
2542 andres 1460 ECB :
1461 : /* prepare record */
2542 andres 1462 CBC 43463 : memset(&xlrec, 0, sizeof(xlrec));
1463 43463 : xlrec.dbId = MyDatabaseId;
1464 43463 : xlrec.tsId = MyDatabaseTableSpace;
1465 43463 : xlrec.relcacheInitFileInval = relcacheInitFileInval;
2542 andres 1466 GIC 43463 : xlrec.nmsgs = nmsgs;
2542 andres 1467 ECB :
1468 : /* perform insertion */
2542 andres 1469 GIC 43463 : XLogBeginInsert();
1470 43463 : XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
1471 43463 : XLogRegisterData((char *) msgs,
1472 : nmsgs * sizeof(SharedInvalidationMessage));
1473 43463 : XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
2542 andres 1474 CBC 43463 : }
1475 :
1476 : /* Return the description of recovery conflict */
1477 : static const char *
821 fujii 1478 10 : get_recovery_conflict_desc(ProcSignalReason reason)
1479 : {
650 peter 1480 10 : const char *reasonDesc = _("unknown reason");
821 fujii 1481 ECB :
821 fujii 1482 GIC 10 : switch (reason)
821 fujii 1483 ECB : {
821 fujii 1484 CBC 4 : case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
650 peter 1485 4 : reasonDesc = _("recovery conflict on buffer pin");
821 fujii 1486 GIC 4 : break;
1487 2 : case PROCSIG_RECOVERY_CONFLICT_LOCK:
650 peter 1488 2 : reasonDesc = _("recovery conflict on lock");
821 fujii 1489 2 : break;
1490 2 : case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
650 peter 1491 CBC 2 : reasonDesc = _("recovery conflict on tablespace");
821 fujii 1492 GIC 2 : break;
1493 2 : case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
650 peter 1494 2 : reasonDesc = _("recovery conflict on snapshot");
821 fujii 1495 2 : break;
2 andres 1496 UNC 0 : case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
1497 0 : reasonDesc = _("recovery conflict on replication slot");
1498 0 : break;
821 fujii 1499 UIC 0 : case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
650 peter 1500 0 : reasonDesc = _("recovery conflict on buffer deadlock");
821 fujii 1501 0 : break;
1502 0 : case PROCSIG_RECOVERY_CONFLICT_DATABASE:
650 peter 1503 0 : reasonDesc = _("recovery conflict on database");
821 fujii 1504 0 : break;
1505 0 : default:
1506 0 : break;
1507 : }
821 fujii 1508 ECB :
821 fujii 1509 CBC 10 : return reasonDesc;
1510 : }
|