TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * clog.c
4 : * PostgreSQL transaction-commit-log manager
5 : *
6 : * This module replaces the old "pg_log" access code, which treated pg_log
7 : * essentially like a relation, in that it went through the regular buffer
8 : * manager. The problem with that was that there wasn't any good way to
9 : * recycle storage space for transactions so old that they'll never be
10 : * looked up again. Now we use specialized access code so that the commit
11 : * log can be broken into relatively small, independent segments.
12 : *
13 : * XLOG interactions: this module generates an XLOG record whenever a new
14 : * CLOG page is initialized to zeroes. Other writes of CLOG come from
15 : * recording of transaction commit or abort in xact.c, which generates its
16 : * own XLOG records for these events and will re-perform the status update
17 : * on redo; so we need make no additional XLOG entry here. For synchronous
18 : * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
19 : * record before we are called to log a commit, so the WAL rule "write xlog
20 : * before data" is satisfied automatically. However, for async commits we
21 : * must track the latest LSN affecting each CLOG page, so that we can flush
22 : * XLOG that far and satisfy the WAL rule. We don't have to worry about this
23 : * for aborts (whether sync or async), since the post-crash assumption would
24 : * be that such transactions failed anyway.
25 : *
26 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : * src/backend/access/transam/clog.c
30 : *
31 : *-------------------------------------------------------------------------
32 : */
33 : #include "postgres.h"
34 :
35 : #include "access/clog.h"
36 : #include "access/slru.h"
37 : #include "access/transam.h"
38 : #include "access/xlog.h"
39 : #include "access/xloginsert.h"
40 : #include "access/xlogutils.h"
41 : #include "miscadmin.h"
42 : #include "pg_trace.h"
43 : #include "pgstat.h"
44 : #include "storage/proc.h"
45 : #include "storage/sync.h"
46 :
47 : /*
48 : * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
49 : * everywhere else in Postgres.
50 : *
51 : * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
52 : * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
53 : * and CLOG segment numbering at
54 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
55 : * explicit notice of that fact in this module, except when comparing segment
56 : * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
57 : */
58 :
/* Each transaction status occupies two bits, so one byte holds four xacts */
#define CLOG_BITS_PER_XACT	2
#define CLOG_XACTS_PER_BYTE 4
#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)

/*
 * Locate an XID inside CLOG: page number, index within the page, byte
 * within the page, and position (0..3) within that byte.
 */
#define TransactionIdToPage(xid)	((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToByte(xid)	(TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid)	((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)

/* The latest async LSN is remembered once per group of transactions */
#define CLOG_XACTS_PER_LSN_GROUP	32	/* keep this a power of 2 */
#define CLOG_LSNS_PER_PAGE	(CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)

/* Index into the per-slot LSN array for the group containing xid */
#define GetLSNIndex(slotno, xid) \
	((slotno) * CLOG_LSNS_PER_PAGE + \
	 TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_LSN_GROUP)
76 :
77 : /*
78 : * The number of subtransactions below which we consider to apply clog group
79 : * update optimization. Testing reveals that the number higher than this can
80 : * hurt performance.
81 : */
82 : #define THRESHOLD_SUBTRANS_CLOG_OPT 5
83 :
84 : /*
85 : * Link to shared-memory data structures for CLOG control
86 : */
87 : static SlruCtlData XactCtlData;
88 :
89 : #define XactCtl (&XactCtlData)
90 :
91 :
92 : static int ZeroCLOGPage(int pageno, bool writeXlog);
93 : static bool CLOGPagePrecedes(int page1, int page2);
94 : static void WriteZeroPageXlogRec(int pageno);
95 : static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact,
96 : Oid oldestXactDb);
97 : static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
98 : TransactionId *subxids, XidStatus status,
99 : XLogRecPtr lsn, int pageno,
100 : bool all_xact_same_page);
101 : static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
102 : XLogRecPtr lsn, int slotno);
103 : static void set_status_by_pages(int nsubxids, TransactionId *subxids,
104 : XidStatus status, XLogRecPtr lsn);
105 : static bool TransactionGroupUpdateXidStatus(TransactionId xid,
106 : XidStatus status, XLogRecPtr lsn, int pageno);
107 : static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
108 : TransactionId *subxids, XidStatus status,
109 : XLogRecPtr lsn, int pageno);
110 :
111 :
112 : /*
113 : * TransactionIdSetTreeStatus
114 : *
115 : * Record the final state of transaction entries in the commit log for
116 : * a transaction and its subtransaction tree. Take care to ensure this is
117 : * efficient, and as atomic as possible.
118 : *
119 : * xid is a single xid to set status for. This will typically be
120 : * the top level transactionid for a top level commit or abort. It can
121 : * also be a subtransaction when we record transaction aborts.
122 : *
123 : * subxids is an array of xids of length nsubxids, representing subtransactions
124 : * in the tree of xid. In various cases nsubxids may be zero.
125 : *
126 : * lsn must be the WAL location of the commit record when recording an async
127 : * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
128 : * caller guarantees the commit record is already flushed in that case. It
129 : * should be InvalidXLogRecPtr for abort cases, too.
130 : *
131 : * In the commit case, atomicity is limited by whether all the subxids are in
132 : * the same CLOG page as xid. If they all are, then the lock will be grabbed
133 : * only once, and the status will be set to committed directly. Otherwise
134 : * we must
135 : * 1. set sub-committed all subxids that are not on the same page as the
136 : * main xid
137 : * 2. atomically set committed the main xid and the subxids on the same page
138 : * 3. go over the first bunch again and set them committed
139 : * Note that as far as concurrent checkers are concerned, main transaction
140 : * commit as a whole is still atomic.
141 : *
142 : * Example:
143 : * TransactionId t commits and has subxids t1, t2, t3, t4
144 : * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
145 : * 1. update pages2-3:
146 : * page2: set t2,t3 as sub-committed
147 : * page3: set t4 as sub-committed
148 : * 2. update page1:
149 : * page1: set t,t1 as committed
150 : * 3. update pages2-3:
151 : * page2: set t2,t3 as committed
152 : * page3: set t4 as committed
153 : *
154 : * NB: this is a low-level routine and is NOT the preferred entry point
155 : * for most uses; functions in transam.c are the intended callers.
156 : *
157 : * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need,
158 : * but aren't yet in cache, as well as hinting pages not to fall out of
159 : * cache yet.
160 ECB : */
161 : void
162 GIC 318992 : TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
163 ECB : TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
164 : {
165 GIC 318992 : int pageno = TransactionIdToPage(xid); /* get page of parent */
166 ECB : int i;
167 :
168 GIC 318992 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
169 : status == TRANSACTION_STATUS_ABORTED);
170 :
171 : /*
172 : * See how many subxids, if any, are on the same page as the parent, if
173 ECB : * any.
174 : */
175 CBC 323134 : for (i = 0; i < nsubxids; i++)
176 EUB : {
177 GIC 4142 : if (TransactionIdToPage(subxids[i]) != pageno)
178 UIC 0 : break;
179 : }
180 :
181 : /*
182 ECB : * Do all items fit on a single page?
183 : */
184 GIC 318992 : if (i == nsubxids)
185 : {
186 : /*
187 ECB : * Set the parent and all subtransactions in a single call
188 : */
189 GIC 318992 : TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
190 : pageno, true);
191 : }
192 EUB : else
193 : {
194 UIC 0 : int nsubxids_on_first_page = i;
195 :
196 : /*
197 : * If this is a commit then we care about doing this correctly (i.e.
198 : * using the subcommitted intermediate status). By here, we know
199 : * we're updating more than one page of clog, so we must mark entries
200 : * that are *not* on the first page so that they show as subcommitted
201 : * before we then return to update the status to fully committed.
202 : *
203 : * To avoid touching the first page twice, skip marking subcommitted
204 EUB : * for the subxids on that first page.
205 : */
206 UBC 0 : if (status == TRANSACTION_STATUS_COMMITTED)
207 UIC 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
208 0 : subxids + nsubxids_on_first_page,
209 : TRANSACTION_STATUS_SUB_COMMITTED, lsn);
210 :
211 : /*
212 : * Now set the parent and subtransactions on same page as the parent,
213 EUB : * if any
214 : */
215 UIC 0 : pageno = TransactionIdToPage(xid);
216 0 : TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
217 : lsn, pageno, false);
218 :
219 : /*
220 : * Now work through the rest of the subxids one clog page at a time,
221 EUB : * starting from the second page onwards, like we did above.
222 : */
223 UIC 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
224 0 : subxids + nsubxids_on_first_page,
225 ECB : status, lsn);
226 : }
227 GIC 318992 : }
228 :
229 : /*
230 : * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
231 : * transactions, chunking in the separate CLOG pages involved. We never
232 : * pass the whole transaction tree to this function, only subtransactions
233 : * that are on different pages to the top level transaction id.
234 EUB : */
235 : static void
236 UIC 0 : set_status_by_pages(int nsubxids, TransactionId *subxids,
237 EUB : XidStatus status, XLogRecPtr lsn)
238 : {
239 UBC 0 : int pageno = TransactionIdToPage(subxids[0]);
240 UIC 0 : int offset = 0;
241 UBC 0 : int i = 0;
242 :
243 0 : Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */
244 :
245 0 : while (i < nsubxids)
246 : {
247 UIC 0 : int num_on_page = 0;
248 : int nextpageno;
249 :
250 EUB : do
251 : {
252 UBC 0 : nextpageno = TransactionIdToPage(subxids[i]);
253 0 : if (nextpageno != pageno)
254 0 : break;
255 0 : num_on_page++;
256 UIC 0 : i++;
257 UBC 0 : } while (i < nsubxids);
258 EUB :
259 UIC 0 : TransactionIdSetPageStatus(InvalidTransactionId,
260 UBC 0 : num_on_page, subxids + offset,
261 EUB : status, lsn, pageno, false);
262 UIC 0 : offset = i;
263 UBC 0 : pageno = nextpageno;
264 : }
265 UIC 0 : }
266 :
267 : /*
268 : * Record the final state of transaction entries in the commit log for all
269 : * entries on a single page. Atomic only on this page.
270 ECB : */
271 : static void
272 GIC 318992 : TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
273 : TransactionId *subxids, XidStatus status,
274 : XLogRecPtr lsn, int pageno,
275 : bool all_xact_same_page)
276 : {
277 : /* Can't use group update when PGPROC overflows. */
278 : StaticAssertDecl(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
279 : "group clog threshold less than PGPROC cached subxids");
280 :
281 : /*
282 : * When there is contention on XactSLRULock, we try to group multiple
283 : * updates; a single leader process will perform transaction status
284 : * updates for multiple backends so that the number of times XactSLRULock
285 : * needs to be acquired is reduced.
286 : *
287 : * For this optimization to be safe, the XID and subxids in MyProc must be
288 : * the same as the ones for which we're setting the status. Check that
289 : * this is the case.
290 : *
291 : * For this optimization to be efficient, we shouldn't have too many
292 : * sub-XIDs and all of the XIDs for which we're adjusting clog should be
293 ECB : * on the same page. Check those conditions, too.
294 : */
295 CBC 318992 : if (all_xact_same_page && xid == MyProc->xid &&
296 297658 : nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
297 297658 : nsubxids == MyProc->subxidStatus.count &&
298 GIC 457 : (nsubxids == 0 ||
299 457 : memcmp(subxids, MyProc->subxids.xids,
300 : nsubxids * sizeof(TransactionId)) == 0))
301 : {
302 : /*
303 : * If we can immediately acquire XactSLRULock, we update the status of
304 : * our own XID and release the lock. If not, try use group XID
305 : * update. If that doesn't work out, fall back to waiting for the
306 ECB : * lock to perform an update for this transaction only.
307 : */
308 GIC 297658 : if (LWLockConditionalAcquire(XactSLRULock, LW_EXCLUSIVE))
309 ECB : {
310 : /* Got the lock without waiting! Do the update. */
311 CBC 297604 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
312 ECB : lsn, pageno);
313 GIC 297604 : LWLockRelease(XactSLRULock);
314 CBC 297604 : return;
315 : }
316 GIC 54 : else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
317 ECB : {
318 : /* Group update mechanism has done the work. */
319 GIC 54 : return;
320 : }
321 :
322 : /* Fall through only if update isn't done yet. */
323 : }
324 ECB :
325 : /* Group update not applicable, or couldn't accept this page number. */
326 GIC 21334 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
327 CBC 21334 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
328 : lsn, pageno);
329 GIC 21334 : LWLockRelease(XactSLRULock);
330 : }
331 :
332 : /*
333 : * Record the final state of transaction entry in the commit log
334 : *
335 : * We don't do any locking here; caller must handle that.
336 ECB : */
337 : static void
338 GIC 318992 : TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
339 : TransactionId *subxids, XidStatus status,
340 : XLogRecPtr lsn, int pageno)
341 : {
342 : int slotno;
343 ECB : int i;
344 :
345 GIC 318992 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
346 ECB : status == TRANSACTION_STATUS_ABORTED ||
347 : (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
348 GIC 318992 : Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE));
349 :
350 : /*
351 : * If we're doing an async commit (ie, lsn is valid), then we must wait
352 : * for any active write on the page slot to complete. Otherwise our
353 : * update could reach disk in that write, which will not do since we
354 : * mustn't let it reach disk until we've done the appropriate WAL flush.
355 : * But when lsn is invalid, it's OK to scribble on a page while it is
356 : * write-busy, since we don't care if the update reaches disk sooner than
357 ECB : * we think.
358 : */
359 GIC 318992 : slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
360 :
361 : /*
362 : * Set the main transaction id, if any.
363 : *
364 : * If we update more than one xid on this page while it is being written
365 : * out, we might find that some of the bits go to disk and others don't.
366 : * If we are updating commits on the page with the top-level xid that
367 : * could break atomicity, so we subcommit the subxids first before we mark
368 ECB : * the top-level commit.
369 : */
370 GIC 318992 : if (TransactionIdIsValid(xid))
371 ECB : {
372 : /* Subtransactions first, if needed ... */
373 CBC 318992 : if (status == TRANSACTION_STATUS_COMMITTED)
374 : {
375 316569 : for (i = 0; i < nsubxids; i++)
376 ECB : {
377 GIC 3824 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
378 3824 : TransactionIdSetStatusBit(subxids[i],
379 : TRANSACTION_STATUS_SUB_COMMITTED,
380 : lsn, slotno);
381 : }
382 : }
383 ECB :
384 : /* ... then the main transaction */
385 GIC 318992 : TransactionIdSetStatusBit(xid, status, lsn, slotno);
386 : }
387 ECB :
388 : /* Set the subtransactions */
389 CBC 323134 : for (i = 0; i < nsubxids; i++)
390 ECB : {
391 GIC 4142 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
392 4142 : TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
393 ECB : }
394 :
395 GIC 318992 : XactCtl->shared->page_dirty[slotno] = true;
396 318992 : }
397 :
398 : /*
399 : * When we cannot immediately acquire XactSLRULock in exclusive mode at
400 : * commit time, add ourselves to a list of processes that need their XIDs
401 : * status update. The first process to add itself to the list will acquire
402 : * XactSLRULock in exclusive mode and set transaction status as required
403 : * on behalf of all group members. This avoids a great deal of contention
404 : * around XactSLRULock when many processes are trying to commit at once,
405 : * since the lock need not be repeatedly handed off from one committing
406 : * process to the next.
407 : *
408 : * Returns true when transaction status has been updated in clog; returns
409 : * false if we decided against applying the optimization because the page
410 : * number we need to update differs from those processes already waiting.
411 ECB : */
412 : static bool
413 GIC 54 : TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
414 ECB : XLogRecPtr lsn, int pageno)
415 : {
416 GIC 54 : volatile PROC_HDR *procglobal = ProcGlobal;
417 54 : PGPROC *proc = MyProc;
418 : uint32 nextidx;
419 : uint32 wakeidx;
420 ECB :
421 : /* We should definitely have an XID whose status needs to be updated. */
422 GIC 54 : Assert(TransactionIdIsValid(xid));
423 :
424 : /*
425 : * Add ourselves to the list of processes needing a group XID status
426 ECB : * update.
427 : */
428 CBC 54 : proc->clogGroupMember = true;
429 54 : proc->clogGroupMemberXid = xid;
430 54 : proc->clogGroupMemberXidStatus = status;
431 GIC 54 : proc->clogGroupMemberPage = pageno;
432 CBC 54 : proc->clogGroupMemberLsn = lsn;
433 :
434 GIC 54 : nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst);
435 :
436 : while (true)
437 : {
438 : /*
439 : * Add the proc to list, if the clog page where we need to update the
440 : * current transaction status is same as group leader's clog page.
441 : *
442 : * There is a race condition here, which is that after doing the below
443 : * check and before adding this proc's clog update to a group, the
444 : * group leader might have already finished the group update for this
445 : * page and becomes group leader of another group. This will lead to a
446 : * situation where a single group can have different clog page
447 : * updates. This isn't likely and will still work, just maybe a bit
448 ECB : * less efficiently.
449 EUB : */
450 GIC 54 : if (nextidx != INVALID_PGPROCNO &&
451 UIC 0 : ProcGlobal->allProcs[nextidx].clogGroupMemberPage != proc->clogGroupMemberPage)
452 : {
453 : /*
454 : * Ensure that this proc is not a member of any clog group that
455 EUB : * needs an XID status update.
456 : */
457 UBC 0 : proc->clogGroupMember = false;
458 UIC 0 : pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO);
459 0 : return false;
460 ECB : }
461 :
462 CBC 54 : pg_atomic_write_u32(&proc->clogGroupNext, nextidx);
463 :
464 54 : if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst,
465 ECB : &nextidx,
466 GIC 54 : (uint32) proc->pgprocno))
467 54 : break;
468 : }
469 :
470 : /*
471 : * If the list was not empty, the leader will update the status of our
472 : * XID. It is impossible to have followers without a leader because the
473 : * first process that has added itself to the list will always have
474 ECB : * nextidx as INVALID_PGPROCNO.
475 : */
476 GBC 54 : if (nextidx != INVALID_PGPROCNO)
477 : {
478 UIC 0 : int extraWaits = 0;
479 EUB :
480 : /* Sleep until the leader updates our XID status. */
481 UIC 0 : pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE);
482 : for (;;)
483 EUB : {
484 : /* acts as a read barrier */
485 UBC 0 : PGSemaphoreLock(proc->sem);
486 0 : if (!proc->clogGroupMember)
487 UIC 0 : break;
488 UBC 0 : extraWaits++;
489 : }
490 0 : pgstat_report_wait_end();
491 :
492 UIC 0 : Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PGPROCNO);
493 EUB :
494 : /* Fix semaphore count for any absorbed wakeups */
495 UBC 0 : while (extraWaits-- > 0)
496 UIC 0 : PGSemaphoreUnlock(proc->sem);
497 0 : return true;
498 : }
499 ECB :
500 : /* We are the leader. Acquire the lock on behalf of everyone. */
501 GIC 54 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
502 :
503 : /*
504 : * Now that we've got the lock, clear the list of processes waiting for
505 : * group XID status update, saving a pointer to the head of the list.
506 ECB : * Trying to pop elements one at a time could lead to an ABA problem.
507 : */
508 GIC 54 : nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst,
509 : INVALID_PGPROCNO);
510 ECB :
511 : /* Remember head of list so we can perform wakeups after dropping lock. */
512 GIC 54 : wakeidx = nextidx;
513 ECB :
514 : /* Walk the list and update the status of all XIDs. */
515 CBC 108 : while (nextidx != INVALID_PGPROCNO)
516 : {
517 GNC 54 : PGPROC *nextproc = &ProcGlobal->allProcs[nextidx];
518 :
519 : /*
520 : * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs
521 ECB : * should not use group XID status update mechanism.
522 : */
523 GNC 54 : Assert(nextproc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT);
524 ECB :
525 GNC 54 : TransactionIdSetPageStatusInternal(nextproc->clogGroupMemberXid,
526 54 : nextproc->subxidStatus.count,
527 54 : nextproc->subxids.xids,
528 : nextproc->clogGroupMemberXidStatus,
529 : nextproc->clogGroupMemberLsn,
530 : nextproc->clogGroupMemberPage);
531 ECB :
532 : /* Move to next proc in list. */
533 GNC 54 : nextidx = pg_atomic_read_u32(&nextproc->clogGroupNext);
534 : }
535 ECB :
536 : /* We're done with the lock now. */
537 GIC 54 : LWLockRelease(XactSLRULock);
538 :
539 : /*
540 : * Now that we've released the lock, go back and wake everybody up. We
541 : * don't do this under the lock so as to keep lock hold times to a
542 ECB : * minimum.
543 : */
544 CBC 108 : while (wakeidx != INVALID_PGPROCNO)
545 : {
546 GNC 54 : PGPROC *wakeproc = &ProcGlobal->allProcs[wakeidx];
547 ECB :
548 GNC 54 : wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext);
549 54 : pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PGPROCNO);
550 ECB :
551 : /* ensure all previous writes are visible before follower continues. */
552 CBC 54 : pg_write_barrier();
553 :
554 GNC 54 : wakeproc->clogGroupMember = false;
555 EUB :
556 GNC 54 : if (wakeproc != MyProc)
557 UNC 0 : PGSemaphoreUnlock(wakeproc->sem);
558 ECB : }
559 :
560 GIC 54 : return true;
561 : }
562 :
563 : /*
564 : * Sets the commit status of a single transaction.
565 : *
566 : * Must be called with XactSLRULock held
567 ECB : */
568 : static void
569 CBC 326958 : TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
570 ECB : {
571 GIC 326958 : int byteno = TransactionIdToByte(xid);
572 326958 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
573 : char *byteptr;
574 : char byteval;
575 ECB : char curval;
576 :
577 GIC 326958 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
578 326958 : curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
579 :
580 : /*
581 : * When replaying transactions during recovery we still need to perform
582 : * the two phases of subcommit and then commit. However, some transactions
583 : * are already correctly marked, so we just treat those as a no-op which
584 ECB : * allows us to keep the following Assert as restrictive as possible.
585 : */
586 GBC 326958 : if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
587 : curval == TRANSACTION_STATUS_COMMITTED)
588 UIC 0 : return;
589 :
590 : /*
591 : * Current state change should be from 0 or subcommitted to target state
592 ECB : * or we should already be there when replaying changes during recovery.
593 : */
594 GIC 326958 : Assert(curval == 0 ||
595 : (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
596 : status != TRANSACTION_STATUS_IN_PROGRESS) ||
597 : curval == status);
598 ECB :
599 : /* note this assumes exclusive access to the clog page */
600 CBC 326958 : byteval = *byteptr;
601 326958 : byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
602 GIC 326958 : byteval |= (status << bshift);
603 326958 : *byteptr = byteval;
604 :
605 : /*
606 : * Update the group LSN if the transaction completion LSN is higher.
607 : *
608 : * Note: lsn will be invalid when supplied during InRecovery processing,
609 : * so we don't need to do anything special to avoid LSN updates during
610 : * recovery. After recovery completes the next clog change will set the
611 ECB : * LSN correctly.
612 : */
613 CBC 326958 : if (!XLogRecPtrIsInvalid(lsn))
614 : {
615 26381 : int lsnindex = GetLSNIndex(slotno, xid);
616 ECB :
617 GIC 26381 : if (XactCtl->shared->group_lsn[lsnindex] < lsn)
618 23769 : XactCtl->shared->group_lsn[lsnindex] = lsn;
619 : }
620 : }
621 :
622 : /*
623 : * Interrogate the state of a transaction in the commit log.
624 : *
625 : * Aside from the actual commit status, this function returns (into *lsn)
626 : * an LSN that is late enough to be able to guarantee that if we flush up to
627 : * that LSN then we will have flushed the transaction's commit record to disk.
628 : * The result is not necessarily the exact LSN of the transaction's commit
629 : * record! For example, for long-past transactions (those whose clog pages
630 : * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
631 : * we group transactions on the same clog page to conserve storage, we might
632 : * return the LSN of a later transaction that falls into the same group.
633 : *
634 : * NB: this is a low-level routine and is NOT the preferred entry point
635 : * for most uses; TransactionLogFetch() in transam.c is the intended caller.
636 ECB : */
637 : XidStatus
638 CBC 1799999 : TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
639 ECB : {
640 CBC 1799999 : int pageno = TransactionIdToPage(xid);
641 GIC 1799999 : int byteno = TransactionIdToByte(xid);
642 1799999 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
643 : int slotno;
644 : int lsnindex;
645 : char *byteptr;
646 : XidStatus status;
647 :
648 ECB : /* lock is acquired by SimpleLruReadPage_ReadOnly */
649 :
650 GIC 1799999 : slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
651 CBC 1799999 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
652 :
653 1799999 : status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
654 ECB :
655 GIC 1799999 : lsnindex = GetLSNIndex(slotno, xid);
656 CBC 1799999 : *lsn = XactCtl->shared->group_lsn[lsnindex];
657 :
658 1799999 : LWLockRelease(XactSLRULock);
659 :
660 GIC 1799999 : return status;
661 : }
662 :
663 : /*
664 : * Number of shared CLOG buffers.
665 : *
666 : * On larger multi-processor systems, it is possible to have many CLOG page
667 : * requests in flight at one time which could lead to disk access for CLOG
668 : * page if the required page is not found in memory. Testing revealed that we
669 : * can get the best performance by having 128 CLOG buffers, more than that it
670 : * doesn't improve performance.
671 : *
672 : * Unconditionally keeping the number of CLOG buffers to 128 did not seem like
673 : * a good idea, because it would increase the minimum amount of shared memory
674 : * required to start, which could be a problem for people running very small
675 : * configurations. The following formula seems to represent a reasonable
676 : * compromise: people with very low values for shared_buffers will get fewer
677 : * CLOG buffers as well, and everyone else will get 128.
678 ECB : */
679 : Size
680 CBC 4564 : CLOGShmemBuffers(void)
681 : {
682 GIC 4564 : return Min(128, Max(4, NBuffers / 512));
683 : }
684 :
685 : /*
686 : * Initialization of shared memory for CLOG
687 ECB : */
688 : Size
689 CBC 2738 : CLOGShmemSize(void)
690 : {
691 GIC 2738 : return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
692 : }
693 ECB :
694 : void
695 CBC 1826 : CLOGShmemInit(void)
696 ECB : {
697 CBC 1826 : XactCtl->PagePrecedes = CLOGPagePrecedes;
698 GIC 1826 : SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
699 CBC 1826 : XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
700 ECB : SYNC_HANDLER_CLOG);
701 GIC 1826 : SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
702 1826 : }
703 :
704 : /*
705 : * This func must be called ONCE on system install. It creates
706 : * the initial CLOG segment. (The CLOG directory is assumed to
707 : * have been created by initdb, and CLOGShmemInit must have been
708 : * called already.)
709 ECB : */
710 : void
711 GIC 305 : BootStrapCLOG(void)
712 : {
713 ECB : int slotno;
714 :
715 GIC 305 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
716 ECB :
717 : /* Create and zero the first page of the commit log */
718 GIC 305 : slotno = ZeroCLOGPage(0, false);
719 ECB :
720 : /* Make sure it's written out */
721 GIC 305 : SimpleLruWritePage(XactCtl, slotno);
722 CBC 305 : Assert(!XactCtl->shared->page_dirty[slotno]);
723 ECB :
724 GIC 305 : LWLockRelease(XactSLRULock);
725 305 : }
726 :
727 : /*
728 : * Initialize (or reinitialize) a page of CLOG to zeroes.
729 : * If writeXlog is true, also emit an XLOG record saying we did this.
730 : *
731 : * The page is not actually written, just set up in shared memory.
732 : * The slot number of the new page is returned.
733 : *
734 : * Control lock must be held at entry, and will be held at exit.
735 ECB : */
736 : static int
737 GIC 608 : ZeroCLOGPage(int pageno, bool writeXlog)
738 : {
739 ECB : int slotno;
740 :
741 CBC 608 : slotno = SimpleLruZeroPage(XactCtl, pageno);
742 ECB :
743 GIC 608 : if (writeXlog)
744 CBC 303 : WriteZeroPageXlogRec(pageno);
745 :
746 GIC 608 : return slotno;
747 : }
748 :
749 : /*
750 : * This must be called ONCE during postmaster or standalone-backend startup,
751 : * after StartupXLOG has initialized ShmemVariableCache->nextXid.
752 ECB : */
753 : void
754 CBC 1176 : StartupCLOG(void)
755 ECB : {
756 GIC 1176 : TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
757 CBC 1176 : int pageno = TransactionIdToPage(xid);
758 :
759 GIC 1176 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
760 :
761 : /*
762 ECB : * Initialize our idea of the latest page number.
763 : */
764 CBC 1176 : XactCtl->shared->latest_page_number = pageno;
765 ECB :
766 GIC 1176 : LWLockRelease(XactSLRULock);
767 1176 : }
768 :
769 : /*
770 : * This must be called ONCE at the end of startup/recovery.
771 ECB : */
772 : void
773 CBC 1142 : TrimCLOG(void)
774 ECB : {
775 GIC 1142 : TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
776 CBC 1142 : int pageno = TransactionIdToPage(xid);
777 :
778 GIC 1142 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
779 :
780 : /*
781 : * Zero out the remainder of the current clog page. Under normal
782 : * circumstances it should be zeroes already, but it seems at least
783 : * theoretically possible that XLOG replay will have settled on a nextXID
784 : * value that is less than the last XID actually used and marked by the
785 : * previous database lifecycle (since subtransaction commit writes clog
786 : * but makes no WAL entry). Let's just be safe. (We need not worry about
787 : * pages beyond the current one, since those will be zeroed when first
788 : * used. For the same reason, there is no need to do anything when
789 : * nextXid is exactly at a page boundary; and it's likely that the
790 ECB : * "current" page doesn't exist yet in that case.)
791 : */
792 CBC 1142 : if (TransactionIdToPgIndex(xid) != 0)
793 ECB : {
794 GIC 1142 : int byteno = TransactionIdToByte(xid);
795 1142 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
796 : int slotno;
797 ECB : char *byteptr;
798 :
799 GIC 1142 : slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
800 1142 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
801 ECB :
802 : /* Zero so-far-unused positions in the current byte */
803 CBC 1142 : *byteptr &= (1 << bshift) - 1;
804 : /* Zero the rest of the page */
805 1142 : MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
806 :
807 GIC 1142 : XactCtl->shared->page_dirty[slotno] = true;
808 ECB : }
809 :
810 GIC 1142 : LWLockRelease(XactSLRULock);
811 1142 : }
812 :
813 : /*
814 : * Perform a checkpoint --- either during shutdown, or on-the-fly
815 ECB : */
816 : void
817 GIC 2363 : CheckPointCLOG(void)
818 : {
819 : /*
820 : * Write dirty CLOG pages to disk. This may result in sync requests
821 : * queued for later handling by ProcessSyncRequests(), as part of the
822 : * checkpoint.
823 ECB : */
824 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
825 CBC 2363 : SimpleLruWriteAll(XactCtl, true);
826 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
827 GIC 2363 : }
828 :
829 :
830 : /*
831 : * Make sure that CLOG has room for a newly-allocated XID.
832 : *
833 : * NB: this is called while holding XidGenLock. We want it to be very fast
834 : * most of the time; even when it's not so fast, no actual I/O need happen
835 : * unless we're forced to write out a dirty clog or xlog page to make room
836 : * in shared memory.
837 ECB : */
838 : void
839 GIC 301130 : ExtendCLOG(TransactionId newestXact)
840 : {
841 : int pageno;
842 :
843 : /*
844 : * No work except at first XID of a page. But beware: just after
845 ECB : * wraparound, the first XID of page zero is FirstNormalTransactionId.
846 : */
847 CBC 301130 : if (TransactionIdToPgIndex(newestXact) != 0 &&
848 : !TransactionIdEquals(newestXact, FirstNormalTransactionId))
849 300827 : return;
850 :
851 303 : pageno = TransactionIdToPage(newestXact);
852 :
853 GIC 303 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
854 ECB :
855 : /* Zero the page and make an XLOG entry about it */
856 CBC 303 : ZeroCLOGPage(pageno, true);
857 :
858 GIC 303 : LWLockRelease(XactSLRULock);
859 : }
860 :
861 :
862 : /*
863 : * Remove all CLOG segments before the one holding the passed transaction ID
864 : *
865 : * Before removing any CLOG data, we must flush XLOG to disk, to ensure
866 : * that any recently-emitted FREEZE_PAGE records have reached disk; otherwise
867 : * a crash and restart might leave us with some unfrozen tuples referencing
868 : * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
869 : * Replaying the deletion from XLOG is not critical, since the files could
870 : * just as well be removed later, but doing so prevents a long-running hot
871 : * standby server from acquiring an unreasonably bloated CLOG directory.
872 : *
873 : * Since CLOG segments hold a large number of transactions, the opportunity to
874 : * actually remove a segment is fairly rare, and so it seems best not to do
875 : * the XLOG flush unless we have confirmed that there is a removable segment.
876 ECB : */
877 : void
878 GIC 317 : TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
879 : {
880 : int cutoffPage;
881 :
882 : /*
883 : * The cutoff point is the start of the segment containing oldestXact. We
884 ECB : * pass the *page* containing oldestXact to SimpleLruTruncate.
885 : */
886 GIC 317 : cutoffPage = TransactionIdToPage(oldestXact);
887 ECB :
888 : /* Check to see if there's any files that could be removed */
889 GIC 317 : if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage))
890 317 : return; /* nothing to remove */
891 :
892 : /*
893 : * Advance oldestClogXid before truncating clog, so concurrent xact status
894 : * lookups can ensure they don't attempt to access truncated-away clog.
895 : *
896 : * It's only necessary to do this if we will actually truncate away clog
897 EUB : * pages.
898 : */
899 UIC 0 : AdvanceOldestClogXid(oldestXact);
900 :
901 : /*
902 : * Write XLOG record and flush XLOG to disk. We record the oldest xid
903 : * we're keeping information about here so we can ensure that it's always
904 : * ahead of clog truncation in case we crash, and so a standby finds out
905 EUB : * the new valid xid before the next checkpoint.
906 : */
907 UIC 0 : WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid);
908 EUB :
909 : /* Now we can remove the old CLOG segment(s) */
910 UIC 0 : SimpleLruTruncate(XactCtl, cutoffPage);
911 : }
912 :
913 :
914 : /*
915 : * Decide whether a CLOG page number is "older" for truncation purposes.
916 : *
917 : * We need to use comparison of TransactionIds here in order to do the right
918 : * thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
919 : * would get weird about permanent xact IDs. So, offset both such that xid1,
920 : * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
921 : * is relevant to page 0 and to the page preceding page 0.
922 : *
923 : * The page containing oldestXact-2^31 is the important edge case. The
924 : * portion of that page equaling or following oldestXact-2^31 is expendable,
925 : * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
926 : * the first XID of a page and segment, the entire page and segment is
927 : * expendable, and we could truncate the segment. Recognizing that case would
928 : * require making oldestXact, not just the page containing oldestXact,
929 : * available to this callback. The benefit would be rare and small, so we
930 : * don't optimize that edge case.
931 ECB : */
932 : static bool
933 GIC 71531 : CLOGPagePrecedes(int page1, int page2)
934 : {
935 : TransactionId xid1;
936 ECB : TransactionId xid2;
937 :
938 CBC 71531 : xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
939 71531 : xid1 += FirstNormalTransactionId + 1;
940 GIC 71531 : xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
941 CBC 71531 : xid2 += FirstNormalTransactionId + 1;
942 ECB :
943 GIC 119007 : return (TransactionIdPrecedes(xid1, xid2) &&
944 47476 : TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
945 : }
946 :
947 :
948 : /*
949 : * Write a ZEROPAGE xlog record
950 ECB : */
951 : static void
952 CBC 303 : WriteZeroPageXlogRec(int pageno)
953 ECB : {
954 CBC 303 : XLogBeginInsert();
955 303 : XLogRegisterData((char *) (&pageno), sizeof(int));
956 GIC 303 : (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
957 303 : }
958 :
959 : /*
960 : * Write a TRUNCATE xlog record
961 : *
962 : * We must flush the xlog record to disk before returning --- see notes
963 : * in TruncateCLOG().
964 EUB : */
965 : static void
966 UIC 0 : WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb)
967 : {
968 : XLogRecPtr recptr;
969 EUB : xl_clog_truncate xlrec;
970 :
971 UBC 0 : xlrec.pageno = pageno;
972 UIC 0 : xlrec.oldestXact = oldestXact;
973 UBC 0 : xlrec.oldestXactDb = oldestXactDb;
974 EUB :
975 UBC 0 : XLogBeginInsert();
976 0 : XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate));
977 0 : recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE);
978 UIC 0 : XLogFlush(recptr);
979 0 : }
980 :
981 : /*
982 : * CLOG resource manager's routines
983 EUB : */
984 : void
985 UBC 0 : clog_redo(XLogReaderState *record)
986 : {
987 UIC 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
988 EUB :
989 : /* Backup blocks are not used in clog records */
990 UBC 0 : Assert(!XLogRecHasAnyBlockRefs(record));
991 :
992 UIC 0 : if (info == CLOG_ZEROPAGE)
993 : {
994 : int pageno;
995 EUB : int slotno;
996 :
997 UBC 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(int));
998 :
999 0 : LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
1000 EUB :
1001 UBC 0 : slotno = ZeroCLOGPage(pageno, false);
1002 UIC 0 : SimpleLruWritePage(XactCtl, slotno);
1003 UBC 0 : Assert(!XactCtl->shared->page_dirty[slotno]);
1004 :
1005 0 : LWLockRelease(XactSLRULock);
1006 : }
1007 UIC 0 : else if (info == CLOG_TRUNCATE)
1008 : {
1009 EUB : xl_clog_truncate xlrec;
1010 :
1011 UBC 0 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));
1012 :
1013 0 : AdvanceOldestClogXid(xlrec.oldestXact);
1014 :
1015 UIC 0 : SimpleLruTruncate(XactCtl, xlrec.pageno);
1016 EUB : }
1017 : else
1018 UIC 0 : elog(PANIC, "clog_redo: unknown op code %u", info);
1019 0 : }
1020 :
1021 : /*
1022 : * Entrypoint for sync.c to sync clog files.
1023 EUB : */
1024 : int
1025 UBC 0 : clogsyncfiletag(const FileTag *ftag, char *path)
1026 : {
1027 UIC 0 : return SlruSyncFileTag(XactCtl, ftag, path);
1028 : }
|