TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * generic_xlog.c
4 : * Implementation of generic xlog records.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * src/backend/access/transam/generic_xlog.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/bufmask.h"
17 : #include "access/generic_xlog.h"
18 : #include "access/xlogutils.h"
19 : #include "miscadmin.h"
20 : #include "utils/memutils.h"
21 :
22 : /*-------------------------------------------------------------------------
23 : * Internally, a delta between pages consists of a set of fragments. Each
24 : * fragment represents changes made in a given region of a page. A fragment
25 : * is made up as follows:
26 : *
27 : * - offset of page region (OffsetNumber)
28 : * - length of page region (OffsetNumber)
29 : * - data - the data to place into the region ('length' number of bytes)
30 : *
31 : * Unchanged regions of a page are not represented in its delta. As a result,
32 : * a delta can be more compact than the full page image. But having an
33 : * unchanged region between two fragments that is smaller than the fragment
34 : * header (offset+length) does not pay off in terms of the overall size of
35 : * the delta. For this reason, we merge adjacent fragments if the unchanged
36 : * region between them is <= MATCH_THRESHOLD bytes.
37 : *
38 : * We do not bother to merge fragments across the "lower" and "upper" parts
39 : * of a page; it's very seldom the case that pd_lower and pd_upper are within
40 : * MATCH_THRESHOLD bytes of each other, and handling that infrequent case
41 : * would complicate and slow down the delta-computation code unduly.
42 : * Therefore, the worst-case delta size includes two fragment headers plus
43 : * a full page's worth of data.
44 : *-------------------------------------------------------------------------
45 : */
/* A fragment header is two OffsetNumbers: region offset, then region length */
#define FRAGMENT_HEADER_SIZE	(sizeof(OffsetNumber) + sizeof(OffsetNumber))
/* Unchanged gaps no longer than this are absorbed into one fragment */
#define MATCH_THRESHOLD			FRAGMENT_HEADER_SIZE
/* Worst case: two fragment headers plus a whole page of data */
#define MAX_DELTA_SIZE			(2 * FRAGMENT_HEADER_SIZE + BLCKSZ)
49 :
50 : /* Struct of generic xlog data for single page */
51 : typedef struct
52 : {
53 : Buffer buffer; /* registered buffer */
54 : int flags; /* flags for this buffer */
55 : int deltaLen; /* space consumed in delta field */
56 : char *image; /* copy of page image for modification, do not
57 : * do it in-place to have aligned memory chunk */
58 : char delta[MAX_DELTA_SIZE]; /* delta between page images */
59 : } PageData;
60 :
61 : /*
62 : * State of generic xlog record construction. Must be allocated at an I/O
63 : * aligned address.
64 : */
65 : struct GenericXLogState
66 : {
67 : /* Page images (properly aligned, must be first) */
68 : PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
69 : /* Info about each page, see above */
70 : PageData pages[MAX_GENERIC_XLOG_PAGES];
71 : bool isLogged;
72 : };
73 :
74 : static void writeFragment(PageData *pageData, OffsetNumber offset,
75 : OffsetNumber length, const char *data);
76 : static void computeRegionDelta(PageData *pageData,
77 : const char *curpage, const char *targetpage,
78 : int targetStart, int targetEnd,
79 : int validStart, int validEnd);
80 : static void computeDelta(PageData *pageData, Page curpage, Page targetpage);
81 : static void applyPageRedo(Page page, const char *delta, Size deltaSize);
82 :
83 :
84 : /*
85 : * Write next fragment into pageData's delta.
86 : *
87 : * The fragment has the given offset and length, and data points to the
88 : * actual data (of length length).
89 : */
90 : static void
91 GIC 579357 : writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
92 : const char *data)
93 : {
94 CBC 579357 : char *ptr = pageData->delta + pageData->deltaLen;
95 :
96 : /* Verify we have enough space */
97 579357 : Assert(pageData->deltaLen + sizeof(offset) +
98 : sizeof(length) + length <= sizeof(pageData->delta));
99 :
100 ECB : /* Write fragment data */
101 GIC 579357 : memcpy(ptr, &offset, sizeof(offset));
102 579357 : ptr += sizeof(offset);
103 579357 : memcpy(ptr, &length, sizeof(length));
104 CBC 579357 : ptr += sizeof(length);
105 579357 : memcpy(ptr, data, length);
106 579357 : ptr += length;
107 ECB :
108 CBC 579357 : pageData->deltaLen = ptr - pageData->delta;
109 579357 : }
110 :
111 ECB : /*
112 : * Compute the XLOG fragments needed to transform a region of curpage into the
113 : * corresponding region of targetpage, and append them to pageData's delta
114 : * field. The region to transform runs from targetStart to targetEnd-1.
115 : * Bytes in curpage outside the range validStart to validEnd-1 should be
116 : * considered invalid, and always overwritten with target data.
117 : *
118 : * This function is a hot spot, so it's worth being as tense as possible
119 : * about the data-matching loops.
120 : */
121 : static void
122 GIC 214150 : computeRegionDelta(PageData *pageData,
123 : const char *curpage, const char *targetpage,
124 : int targetStart, int targetEnd,
125 ECB : int validStart, int validEnd)
126 : {
127 : int i,
128 : loopEnd,
129 GIC 214150 : fragmentBegin = -1,
130 214150 : fragmentEnd = -1;
131 :
132 ECB : /* Deal with any invalid start region by including it in first fragment */
133 CBC 214150 : if (validStart > targetStart)
134 : {
135 UIC 0 : fragmentBegin = targetStart;
136 LBC 0 : targetStart = validStart;
137 : }
138 EUB :
139 : /* We'll deal with any invalid end region after the main loop */
140 GIC 214150 : loopEnd = Min(targetEnd, validEnd);
141 :
142 : /* Examine all the potentially matchable bytes */
143 CBC 214150 : i = targetStart;
144 GIC 3351954 : while (i < loopEnd)
145 : {
146 CBC 3138966 : if (curpage[i] != targetpage[i])
147 ECB : {
148 : /* On unmatched byte, start new fragment if not already in one */
149 CBC 3030592 : if (fragmentBegin < 0)
150 GIC 475362 : fragmentBegin = i;
151 : /* Mark unmatched-data endpoint as uncertain */
152 CBC 3030592 : fragmentEnd = -1;
153 ECB : /* Extend the fragment as far as possible in a tight loop */
154 GIC 3030592 : i++;
155 CBC 5093436 : while (i < loopEnd && curpage[i] != targetpage[i])
156 GIC 2062844 : i++;
157 CBC 3030592 : if (i >= loopEnd)
158 1162 : break;
159 ECB : }
160 :
161 : /* Found a matched byte, so remember end of unmatched fragment */
162 GIC 3137804 : fragmentEnd = i;
163 :
164 : /*
165 ECB : * Extend the match as far as possible in a tight loop. (On typical
166 : * workloads, this inner loop is the bulk of this function's runtime.)
167 : */
168 GIC 3137804 : i++;
169 775485060 : while (i < loopEnd && curpage[i] == targetpage[i])
170 772347256 : i++;
171 ECB :
172 : /*
173 : * There are several possible cases at this point:
174 : *
175 : * 1. We have no unwritten fragment (fragmentBegin < 0). There's
176 : * nothing to write; and it doesn't matter what fragmentEnd is.
177 : *
178 : * 2. We found more than MATCH_THRESHOLD consecutive matching bytes.
179 : * Dump out the unwritten fragment, stopping at fragmentEnd.
180 : *
181 : * 3. The match extends to loopEnd. We'll do nothing here, exit the
182 : * loop, and then dump the unwritten fragment, after merging it with
183 : * the invalid end region if any. If we don't so merge, fragmentEnd
184 : * establishes how much the final writeFragment call needs to write.
185 : *
186 : * 4. We found an unmatched byte before loopEnd. The loop will repeat
187 : * and will enter the unmatched-byte stanza above. So in this case
188 : * also, it doesn't matter what fragmentEnd is. The matched bytes
189 : * will get merged into the continuing unmatched fragment.
190 : *
191 : * Only in case 3 do we reach the bottom of the loop with a meaningful
192 : * fragmentEnd value, which is why it's OK that we unconditionally
193 : * assign "fragmentEnd = i" above.
194 : */
195 GIC 3137804 : if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD)
196 : {
197 473596 : writeFragment(pageData, fragmentBegin,
198 CBC 473596 : fragmentEnd - fragmentBegin,
199 : targetpage + fragmentBegin);
200 473596 : fragmentBegin = -1;
201 473596 : fragmentEnd = -1; /* not really necessary */
202 : }
203 ECB : }
204 :
205 : /* Deal with any invalid end region by including it in final fragment */
206 GIC 214150 : if (loopEnd < targetEnd)
207 : {
208 103995 : if (fragmentBegin < 0)
209 CBC 103995 : fragmentBegin = loopEnd;
210 GIC 103995 : fragmentEnd = targetEnd;
211 ECB : }
212 :
213 : /* Write final fragment if any */
214 GIC 214150 : if (fragmentBegin >= 0)
215 : {
216 105761 : if (fragmentEnd < 0)
217 CBC 1162 : fragmentEnd = targetEnd;
218 GIC 105761 : writeFragment(pageData, fragmentBegin,
219 CBC 105761 : fragmentEnd - fragmentBegin,
220 ECB : targetpage + fragmentBegin);
221 : }
222 CBC 214150 : }
223 :
224 : /*
225 ECB : * Compute the XLOG delta record needed to transform curpage into targetpage,
226 : * and store it in pageData's delta field.
227 : */
228 : static void
229 GIC 107075 : computeDelta(PageData *pageData, Page curpage, Page targetpage)
230 : {
231 107075 : int targetLower = ((PageHeader) targetpage)->pd_lower,
232 CBC 107075 : targetUpper = ((PageHeader) targetpage)->pd_upper,
233 GIC 107075 : curLower = ((PageHeader) curpage)->pd_lower,
234 CBC 107075 : curUpper = ((PageHeader) curpage)->pd_upper;
235 ECB :
236 CBC 107075 : pageData->deltaLen = 0;
237 ECB :
238 : /* Compute delta records for lower part of page ... */
239 CBC 107075 : computeRegionDelta(pageData, curpage, targetpage,
240 : 0, targetLower,
241 : 0, curLower);
242 ECB : /* ... and for upper part, ignoring what's between */
243 GIC 107075 : computeRegionDelta(pageData, curpage, targetpage,
244 : targetUpper, BLCKSZ,
245 : curUpper, BLCKSZ);
246 ECB :
247 : /*
248 : * If xlog debug is enabled, then check produced delta. Result of delta
249 : * application to curpage should be equivalent to targetpage.
250 : */
251 : #ifdef WAL_DEBUG
252 : if (XLOG_DEBUG)
253 : {
254 : PGAlignedBlock tmp;
255 :
256 : memcpy(tmp.data, curpage, BLCKSZ);
257 : applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen);
258 : if (memcmp(tmp.data, targetpage, targetLower) != 0 ||
259 : memcmp(tmp.data + targetUpper, targetpage + targetUpper,
260 : BLCKSZ - targetUpper) != 0)
261 : elog(ERROR, "result of generic xlog apply does not match");
262 : }
263 : #endif
264 GIC 107075 : }
265 :
266 : /*
267 ECB : * Start new generic xlog record for modifications to specified relation.
268 : */
269 : GenericXLogState *
270 GIC 107305 : GenericXLogStart(Relation relation)
271 : {
272 : GenericXLogState *state;
273 ECB : int i;
274 :
275 GNC 107305 : state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
276 : PG_IO_ALIGN_SIZE,
277 : 0);
278 GIC 107305 : state->isLogged = RelationNeedsWAL(relation);
279 :
280 CBC 536525 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
281 : {
282 GIC 429220 : state->pages[i].image = state->images[i].data;
283 CBC 429220 : state->pages[i].buffer = InvalidBuffer;
284 : }
285 ECB :
286 GIC 107305 : return state;
287 ECB : }
288 :
289 : /*
290 : * Register new buffer for generic xlog record.
291 : *
292 : * Returns pointer to the page's image in the GenericXLogState, which
293 : * is what the caller should modify.
294 : *
295 : * If the buffer is already registered, just return its existing entry.
296 : * (It's not very clear what to do with the flags in such a case, but
297 : * for now we stay with the original flags.)
298 : */
299 : Page
300 GIC 108593 : GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
301 : {
302 : int block_id;
303 :
304 : /* Search array for existing entry or first unused slot */
305 CBC 109881 : for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++)
306 : {
307 GIC 109881 : PageData *page = &state->pages[block_id];
308 :
309 109881 : if (BufferIsInvalid(page->buffer))
310 ECB : {
311 : /* Empty slot, so use it (there cannot be a match later) */
312 CBC 108593 : page->buffer = buffer;
313 GIC 108593 : page->flags = flags;
314 CBC 108593 : memcpy(page->image, BufferGetPage(buffer), BLCKSZ);
315 GIC 108593 : return (Page) page->image;
316 : }
317 CBC 1288 : else if (page->buffer == buffer)
318 ECB : {
319 : /*
320 : * Buffer is already registered. Just return the image, which is
321 : * already prepared.
322 : */
323 UIC 0 : return (Page) page->image;
324 : }
325 : }
326 :
327 0 : elog(ERROR, "maximum number %d of generic xlog buffers is exceeded",
328 EUB : MAX_GENERIC_XLOG_PAGES);
329 : /* keep compiler quiet */
330 : return NULL;
331 : }
332 :
333 : /*
334 : * Apply changes represented by GenericXLogState to the actual buffers,
335 : * and emit a generic xlog record.
336 : */
337 : XLogRecPtr
338 GIC 106010 : GenericXLogFinish(GenericXLogState *state)
339 : {
340 : XLogRecPtr lsn;
341 : int i;
342 :
343 CBC 106010 : if (state->isLogged)
344 : {
345 : /* Logged relation: make xlog record in critical section. */
346 GIC 106005 : XLogBeginInsert();
347 :
348 CBC 106005 : START_CRIT_SECTION();
349 :
350 GIC 530025 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
351 ECB : {
352 GIC 424020 : PageData *pageData = &state->pages[i];
353 ECB : Page page;
354 : PageHeader pageHeader;
355 :
356 GIC 424020 : if (BufferIsInvalid(pageData->buffer))
357 CBC 316727 : continue;
358 :
359 GIC 107293 : page = BufferGetPage(pageData->buffer);
360 107293 : pageHeader = (PageHeader) pageData->image;
361 ECB :
362 CBC 107293 : if (pageData->flags & GENERIC_XLOG_FULL_IMAGE)
363 : {
364 ECB : /*
365 : * A full-page image does not require us to supply any xlog
366 : * data. Just apply the image, being careful to zero the
367 : * "hole" between pd_lower and pd_upper in order to avoid
368 : * divergence between actual page state and what replay would
369 : * produce.
370 : */
371 GIC 218 : memcpy(page, pageData->image, pageHeader->pd_lower);
372 218 : memset(page + pageHeader->pd_lower, 0,
373 218 : pageHeader->pd_upper - pageHeader->pd_lower);
374 218 : memcpy(page + pageHeader->pd_upper,
375 218 : pageData->image + pageHeader->pd_upper,
376 CBC 218 : BLCKSZ - pageHeader->pd_upper);
377 ECB :
378 CBC 218 : XLogRegisterBuffer(i, pageData->buffer,
379 ECB : REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
380 : }
381 : else
382 : {
383 : /*
384 : * In normal mode, calculate delta and write it as xlog data
385 : * associated with this page.
386 : */
387 GIC 107075 : computeDelta(pageData, page, (Page) pageData->image);
388 :
389 : /* Apply the image, with zeroed "hole" as above */
390 107075 : memcpy(page, pageData->image, pageHeader->pd_lower);
391 107075 : memset(page + pageHeader->pd_lower, 0,
392 CBC 107075 : pageHeader->pd_upper - pageHeader->pd_lower);
393 GIC 107075 : memcpy(page + pageHeader->pd_upper,
394 107075 : pageData->image + pageHeader->pd_upper,
395 CBC 107075 : BLCKSZ - pageHeader->pd_upper);
396 ECB :
397 CBC 107075 : XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD);
398 107075 : XLogRegisterBufData(i, pageData->delta, pageData->deltaLen);
399 ECB : }
400 : }
401 :
402 : /* Insert xlog record */
403 CBC 106005 : lsn = XLogInsert(RM_GENERIC_ID, 0);
404 :
405 : /* Set LSN and mark buffers dirty */
406 GIC 530025 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
407 : {
408 CBC 424020 : PageData *pageData = &state->pages[i];
409 :
410 GIC 424020 : if (BufferIsInvalid(pageData->buffer))
411 CBC 316727 : continue;
412 GIC 107293 : PageSetLSN(BufferGetPage(pageData->buffer), lsn);
413 CBC 107293 : MarkBufferDirty(pageData->buffer);
414 : }
415 106005 : END_CRIT_SECTION();
416 ECB : }
417 : else
418 : {
419 : /* Unlogged relation: skip xlog-related stuff */
420 CBC 5 : START_CRIT_SECTION();
421 GIC 25 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
422 : {
423 20 : PageData *pageData = &state->pages[i];
424 :
425 CBC 20 : if (BufferIsInvalid(pageData->buffer))
426 15 : continue;
427 GIC 10 : memcpy(BufferGetPage(pageData->buffer),
428 CBC 5 : pageData->image,
429 : BLCKSZ);
430 ECB : /* We don't worry about zeroing the "hole" in this case */
431 CBC 5 : MarkBufferDirty(pageData->buffer);
432 ECB : }
433 CBC 5 : END_CRIT_SECTION();
434 : /* We don't have a LSN to return, in this case */
435 GIC 5 : lsn = InvalidXLogRecPtr;
436 ECB : }
437 :
438 CBC 106010 : pfree(state);
439 :
440 106010 : return lsn;
441 : }
442 :
443 ECB : /*
444 : * Abort generic xlog record construction. No changes are applied to buffers.
445 : *
446 : * Note: caller is responsible for releasing locks/pins on buffers, if needed.
447 : */
448 : void
449 GIC 1295 : GenericXLogAbort(GenericXLogState *state)
450 : {
451 1295 : pfree(state);
452 1295 : }
453 :
454 ECB : /*
455 : * Apply delta to given page image.
456 : */
457 : static void
458 UIC 0 : applyPageRedo(Page page, const char *delta, Size deltaSize)
459 : {
460 0 : const char *ptr = delta;
461 0 : const char *end = delta + deltaSize;
462 :
463 UBC 0 : while (ptr < end)
464 : {
465 EUB : OffsetNumber offset,
466 : length;
467 :
468 UBC 0 : memcpy(&offset, ptr, sizeof(offset));
469 UIC 0 : ptr += sizeof(offset);
470 0 : memcpy(&length, ptr, sizeof(length));
471 0 : ptr += sizeof(length);
472 :
473 UBC 0 : memcpy(page + offset, ptr, length);
474 EUB :
475 UBC 0 : ptr += length;
476 EUB : }
477 UIC 0 : }
478 EUB :
479 : /*
480 : * Redo function for generic xlog record.
481 : */
482 : void
483 UIC 0 : generic_redo(XLogReaderState *record)
484 : {
485 0 : XLogRecPtr lsn = record->EndRecPtr;
486 : Buffer buffers[MAX_GENERIC_XLOG_PAGES];
487 : uint8 block_id;
488 EUB :
489 : /* Protect limited size of buffers[] array */
490 UBC 0 : Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES);
491 :
492 : /* Iterate over blocks */
493 UIC 0 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
494 : {
495 EUB : XLogRedoAction action;
496 :
497 UIC 0 : if (!XLogRecHasBlockRef(record, block_id))
498 EUB : {
499 UIC 0 : buffers[block_id] = InvalidBuffer;
500 0 : continue;
501 : }
502 EUB :
503 UIC 0 : action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]);
504 EUB :
505 : /* Apply redo to given block if needed */
506 UIC 0 : if (action == BLK_NEEDS_REDO)
507 : {
508 EUB : Page page;
509 : PageHeader pageHeader;
510 : char *blockDelta;
511 : Size blockDeltaSize;
512 :
513 UIC 0 : page = BufferGetPage(buffers[block_id]);
514 0 : blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize);
515 0 : applyPageRedo(page, blockDelta, blockDeltaSize);
516 :
517 : /*
518 EUB : * Since the delta contains no information about what's in the
519 : * "hole" between pd_lower and pd_upper, set that to zero to
520 : * ensure we produce the same page state that application of the
521 : * logged action by GenericXLogFinish did.
522 : */
523 UIC 0 : pageHeader = (PageHeader) page;
524 0 : memset(page + pageHeader->pd_lower, 0,
525 0 : pageHeader->pd_upper - pageHeader->pd_lower);
526 :
527 0 : PageSetLSN(page, lsn);
528 UBC 0 : MarkBufferDirty(buffers[block_id]);
529 EUB : }
530 : }
531 :
532 : /* Changes are done: unlock and release all buffers */
533 UBC 0 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
534 : {
535 UIC 0 : if (BufferIsValid(buffers[block_id]))
536 0 : UnlockReleaseBuffer(buffers[block_id]);
537 : }
538 UBC 0 : }
539 :
540 EUB : /*
541 : * Mask a generic page before performing consistency checks on it.
542 : */
543 : void
544 UIC 0 : generic_mask(char *page, BlockNumber blkno)
545 : {
546 0 : mask_page_lsn_and_checksum(page);
547 :
548 0 : mask_unused_space(page);
549 UBC 0 : }
|