Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * logtape.c
4 : : * Management of "logical tapes" within temporary files.
5 : : *
6 : : * This module exists to support sorting via multiple merge passes (see
7 : : * tuplesort.c). Merging is an ideal algorithm for tape devices, but if
8 : : * we implement it on disk by creating a separate file for each "tape",
9 : : * there is an annoying problem: the peak space usage is at least twice
10 : : * the volume of actual data to be sorted. (This must be so because each
11 : : * datum will appear in both the input and output tapes of the final
12 : : * merge pass.)
13 : : *
14 : : * We can work around this problem by recognizing that any one tape
15 : : * dataset (with the possible exception of the final output) is written
16 : : * and read exactly once in a perfectly sequential manner. Therefore,
17 : : * a datum once read will not be required again, and we can recycle its
18 : : * space for use by the new tape dataset(s) being generated. In this way,
19 : : * the total space usage is essentially just the actual data volume, plus
20 : : * insignificant bookkeeping and start/stop overhead.
21 : : *
22 : : * Few OSes allow arbitrary parts of a file to be released back to the OS,
23 : : * so we have to implement this space-recycling ourselves within a single
24 : : * logical file. logtape.c exists to perform this bookkeeping and provide
25 : : * the illusion of N independent tape devices to tuplesort.c. Note that
26 : : * logtape.c itself depends on buffile.c to provide a "logical file" of
27 : : * larger size than the underlying OS may support.
28 : : *
29 : : * For simplicity, we allocate and release space in the underlying file
30 : : * in BLCKSZ-size blocks. Space allocation boils down to keeping track
31 : : * of which blocks in the underlying file belong to which logical tape,
32 : : * plus any blocks that are free (recycled and not yet reused).
33 : : * The blocks in each logical tape form a chain, with a prev- and next-
34 : : * pointer in each block.
35 : : *
36 : : * The initial write pass is guaranteed to fill the underlying file
37 : : * perfectly sequentially, no matter how data is divided into logical tapes.
38 : : * Once we begin merge passes, the access pattern becomes considerably
39 : : * less predictable --- but the seeking involved should be comparable to
40 : : * what would happen if we kept each logical tape in a separate file,
41 : : * so there's no serious performance penalty paid to obtain the space
42 : : * savings of recycling. We try to localize the write accesses by always
43 : : * writing to the lowest-numbered free block when we have a choice; it's
44 : : * not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
45 : : * policy for free blocks would be better?)
46 : : *
47 : : * To further make the I/Os more sequential, we can use a larger buffer
48 : : * when reading, and read multiple blocks from the same tape in one go,
49 : : * whenever the buffer becomes empty.
50 : : *
51 : : * To support the above policy of writing to the lowest free block, the
52 : : * freelist is a min heap.
53 : : *
54 : : * Since all the bookkeeping and buffer memory is allocated with palloc(),
55 : : * and the underlying file(s) are made with OpenTemporaryFile, all resources
56 : : * for a logical tape set are certain to be cleaned up even if processing
57 : : * is aborted by ereport(ERROR). To avoid confusion, the caller should take
58 : : * care that all calls for a single LogicalTapeSet are made in the same
59 : : * palloc context.
60 : : *
61 : : * To support parallel sort operations involving coordinated callers to
62 : : * tuplesort.c routines across multiple workers, it is necessary to
63 : : * concatenate each worker BufFile/tapeset into one single logical tapeset
64 : : * managed by the leader. Workers should have produced one final
65 : : * materialized tape (their entire output) when this happens in leader.
66 : : * There will always be the same number of runs as input tapes, and the same
67 : : * number of input tapes as participants (worker Tuplesortstates).
68 : : *
69 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
70 : : * Portions Copyright (c) 1994, Regents of the University of California
71 : : *
72 : : * IDENTIFICATION
73 : : * src/backend/utils/sort/logtape.c
74 : : *
75 : : *-------------------------------------------------------------------------
76 : : */
77 : :
78 : : #include "postgres.h"
79 : :
80 : : #include <fcntl.h>
81 : :
82 : : #include "storage/buffile.h"
83 : : #include "utils/builtins.h"
84 : : #include "utils/logtape.h"
85 : : #include "utils/memdebug.h"
86 : : #include "utils/memutils.h"
87 : :
88 : : /*
89 : : * A TapeBlockTrailer is stored at the end of each BLCKSZ block.
90 : : *
91 : : * The first block of a tape has prev == -1. The last block of a tape
92 : : * stores the number of valid bytes on the block, inverted, in 'next'
93 : : * Therefore next < 0 indicates the last block.
94 : : */
95 : : typedef struct TapeBlockTrailer
96 : : {
97 : : int64 prev; /* previous block on this tape, or -1 on first
98 : : * block */
99 : : int64 next; /* next block on this tape, or # of valid
100 : : * bytes on last block (if < 0) */
101 : : } TapeBlockTrailer;
102 : :
103 : : #define TapeBlockPayloadSize (BLCKSZ - sizeof(TapeBlockTrailer))
104 : : #define TapeBlockGetTrailer(buf) \
105 : : ((TapeBlockTrailer *) ((char *) buf + TapeBlockPayloadSize))
106 : :
107 : : #define TapeBlockIsLast(buf) (TapeBlockGetTrailer(buf)->next < 0)
108 : : #define TapeBlockGetNBytes(buf) \
109 : : (TapeBlockIsLast(buf) ? \
110 : : (- TapeBlockGetTrailer(buf)->next) : TapeBlockPayloadSize)
111 : : #define TapeBlockSetNBytes(buf, nbytes) \
112 : : (TapeBlockGetTrailer(buf)->next = -(nbytes))
113 : :
114 : : /*
115 : : * When multiple tapes are being written to concurrently (as in HashAgg),
116 : : * avoid excessive fragmentation by preallocating block numbers to individual
117 : : * tapes. Each preallocation doubles in size starting at
118 : : * TAPE_WRITE_PREALLOC_MIN blocks up to TAPE_WRITE_PREALLOC_MAX blocks.
119 : : *
120 : : * No filesystem operations are performed for preallocation; only the block
121 : : * numbers are reserved. This may lead to sparse writes, which will cause
122 : : * ltsWriteBlock() to fill in holes with zeros.
123 : : */
124 : : #define TAPE_WRITE_PREALLOC_MIN 8
125 : : #define TAPE_WRITE_PREALLOC_MAX 128
126 : :
127 : : /*
128 : : * This data structure represents a single "logical tape" within the set
129 : : * of logical tapes stored in the same file.
130 : : *
131 : : * While writing, we hold the current partially-written data block in the
132 : : * buffer. While reading, we can hold multiple blocks in the buffer. Note
133 : : * that we don't retain the trailers of a block when it's read into the
134 : : * buffer. The buffer therefore contains one large contiguous chunk of data
135 : : * from the tape.
136 : : */
137 : : struct LogicalTape
138 : : {
139 : : LogicalTapeSet *tapeSet; /* tape set this tape is part of */
140 : :
141 : : bool writing; /* T while in write phase */
142 : : bool frozen; /* T if blocks should not be freed when read */
143 : : bool dirty; /* does buffer need to be written? */
144 : :
145 : : /*
146 : : * Block numbers of the first, current, and next block of the tape.
147 : : *
148 : : * The "current" block number is only valid when writing, or reading from
149 : : * a frozen tape. (When reading from an unfrozen tape, we use a larger
150 : : * read buffer that holds multiple blocks, so the "current" block is
151 : : * ambiguous.)
152 : : *
153 : : * When concatenation of worker tape BufFiles is performed, an offset to
154 : : * the first block in the unified BufFile space is applied during reads.
155 : : */
156 : : int64 firstBlockNumber;
157 : : int64 curBlockNumber;
158 : : int64 nextBlockNumber;
159 : : int64 offsetBlockNumber;
160 : :
161 : : /*
162 : : * Buffer for current data block(s).
163 : : */
164 : : char *buffer; /* physical buffer (separately palloc'd) */
165 : : int buffer_size; /* allocated size of the buffer */
166 : : int max_size; /* highest useful, safe buffer_size */
167 : : int pos; /* next read/write position in buffer */
168 : : int nbytes; /* total # of valid bytes in buffer */
169 : :
170 : : /*
171 : : * Preallocated block numbers are held in an array sorted in descending
172 : : * order; blocks are consumed from the end of the array (lowest block
173 : : * numbers first).
174 : : */
175 : : int64 *prealloc;
176 : : int nprealloc; /* number of elements in list */
177 : : int prealloc_size; /* number of elements list can hold */
178 : : };
179 : :
180 : : /*
181 : : * This data structure represents a set of related "logical tapes" sharing
182 : : * space in a single underlying file. (But that "file" may be multiple files
183 : : * if needed to escape OS limits on file size; buffile.c handles that for us.)
184 : : * Tapes belonging to a tape set can be created and destroyed on-the-fly, on
185 : : * demand.
186 : : */
187 : : struct LogicalTapeSet
188 : : {
189 : : BufFile *pfile; /* underlying file for whole tape set */
190 : : SharedFileSet *fileset;
191 : : int worker; /* worker # if shared, -1 for leader/serial */
192 : :
193 : : /*
194 : : * File size tracking. nBlocksWritten is the size of the underlying file,
195 : : * in BLCKSZ blocks. nBlocksAllocated is the number of blocks allocated
196 : : * by ltsReleaseBlock(), and it is always greater than or equal to
197 : : * nBlocksWritten. Blocks between nBlocksAllocated and nBlocksWritten are
198 : : * blocks that have been allocated for a tape, but have not been written
199 : : * to the underlying file yet. nHoleBlocks tracks the total number of
200 : : * blocks that are in unused holes between worker spaces following BufFile
201 : : * concatenation.
202 : : */
203 : : int64 nBlocksAllocated; /* # of blocks allocated */
204 : : int64 nBlocksWritten; /* # of blocks used in underlying file */
205 : : int64 nHoleBlocks; /* # of "hole" blocks left */
206 : :
207 : : /*
208 : : * We store the numbers of recycled-and-available blocks in freeBlocks[].
209 : : * When there are no such blocks, we extend the underlying file.
210 : : *
211 : : * If forgetFreeSpace is true then any freed blocks are simply forgotten
212 : : * rather than being remembered in freeBlocks[]. See notes for
213 : : * LogicalTapeSetForgetFreeSpace().
214 : : */
215 : : bool forgetFreeSpace; /* are we remembering free blocks? */
216 : : int64 *freeBlocks; /* resizable array holding minheap */
217 : : int64 nFreeBlocks; /* # of currently free blocks */
218 : : Size freeBlocksLen; /* current allocated length of freeBlocks[] */
219 : : bool enable_prealloc; /* preallocate write blocks? */
220 : : };
221 : :
222 : : static LogicalTape *ltsCreateTape(LogicalTapeSet *lts);
223 : : static void ltsWriteBlock(LogicalTapeSet *lts, int64 blocknum, const void *buffer);
224 : : static void ltsReadBlock(LogicalTapeSet *lts, int64 blocknum, void *buffer);
225 : : static int64 ltsGetBlock(LogicalTapeSet *lts, LogicalTape *lt);
226 : : static int64 ltsGetFreeBlock(LogicalTapeSet *lts);
227 : : static int64 ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
228 : : static void ltsReleaseBlock(LogicalTapeSet *lts, int64 blocknum);
229 : : static void ltsInitReadBuffer(LogicalTape *lt);
230 : :
231 : :
232 : : /*
233 : : * Write a block-sized buffer to the specified block of the underlying file.
234 : : *
235 : : * No need for an error return convention; we ereport() on any error.
236 : : */
237 : : static void
149 michael@paquier.xyz 238 :GNC 25567 : ltsWriteBlock(LogicalTapeSet *lts, int64 blocknum, const void *buffer)
239 : : {
240 : : /*
241 : : * BufFile does not support "holes", so if we're about to write a block
242 : : * that's past the current end of file, fill the space between the current
243 : : * end of file and the target block with zeros.
244 : : *
245 : : * This can happen either when tapes preallocate blocks; or for the last
246 : : * block of a tape which might not have been flushed.
247 : : *
248 : : * Note that BufFile concatenation can leave "holes" in BufFile between
249 : : * worker-owned block ranges. These are tracked for reporting purposes
250 : : * only. We never read from nor write to these hole blocks, and so they
251 : : * are not considered here.
252 : : */
2629 heikki.linnakangas@i 253 [ + + ]:CBC 27691 : while (blocknum > lts->nBlocksWritten)
254 : : {
255 : : PGIOAlignedBlock zerobuf;
256 : :
2052 tgl@sss.pgh.pa.us 257 [ + - + - : 2124 : MemSet(zerobuf.data, 0, sizeof(zerobuf));
+ - - + -
- ]
258 : :
259 : 2124 : ltsWriteBlock(lts, lts->nBlocksWritten, zerobuf.data);
260 : : }
261 : :
262 : : /* Write the requested block */
1398 tmunro@postgresql.or 263 [ - + ]: 25567 : if (BufFileSeekBlock(lts->pfile, blocknum) != 0)
7569 tgl@sss.pgh.pa.us 264 [ # # ]:UBC 0 : ereport(ERROR,
265 : : (errcode_for_file_access(),
266 : : errmsg("could not seek to block %lld of temporary file",
267 : : (long long) blocknum)));
1398 tmunro@postgresql.or 268 :CBC 25567 : BufFileWrite(lts->pfile, buffer, BLCKSZ);
269 : :
270 : : /* Update nBlocksWritten, if we extended the file */
2629 heikki.linnakangas@i 271 [ + + ]: 25567 : if (blocknum == lts->nBlocksWritten)
272 : 9529 : lts->nBlocksWritten++;
8947 tgl@sss.pgh.pa.us 273 : 25567 : }
274 : :
275 : : /*
276 : : * Read a block-sized buffer from the specified block of the underlying file.
277 : : *
278 : : * No need for an error return convention; we ereport() on any error. This
279 : : * module should never attempt to read a block it doesn't know is there.
280 : : */
281 : : static void
149 michael@paquier.xyz 282 :GNC 23286 : ltsReadBlock(LogicalTapeSet *lts, int64 blocknum, void *buffer)
283 : : {
1398 tmunro@postgresql.or 284 [ - + ]:CBC 23286 : if (BufFileSeekBlock(lts->pfile, blocknum) != 0)
7569 tgl@sss.pgh.pa.us 285 [ # # ]:UBC 0 : ereport(ERROR,
286 : : (errcode_for_file_access(),
287 : : errmsg("could not seek to block %lld of temporary file",
288 : : (long long) blocknum)));
454 peter@eisentraut.org 289 :CBC 23286 : BufFileReadExact(lts->pfile, buffer, BLCKSZ);
8947 tgl@sss.pgh.pa.us 290 : 23286 : }
291 : :
292 : : /*
293 : : * Read as many blocks as we can into the per-tape buffer.
294 : : *
295 : : * Returns true if anything was read, 'false' on EOF.
296 : : */
297 : : static bool
909 heikki.linnakangas@i 298 : 30407 : ltsReadFillBuffer(LogicalTape *lt)
299 : : {
2750 300 : 30407 : lt->pos = 0;
301 : 30407 : lt->nbytes = 0;
302 : :
303 : : do
304 : : {
2670 305 : 36534 : char *thisbuf = lt->buffer + lt->nbytes;
149 michael@paquier.xyz 306 :GNC 36534 : int64 datablocknum = lt->nextBlockNumber;
307 : :
308 : : /* Fetch next block number */
2263 rhaas@postgresql.org 309 [ + + ]:CBC 36534 : if (datablocknum == -1L)
2670 heikki.linnakangas@i 310 : 13506 : break; /* EOF */
311 : : /* Apply worker offset, needed for leader tapesets */
2263 rhaas@postgresql.org 312 : 23028 : datablocknum += lt->offsetBlockNumber;
313 : :
314 : : /* Read the block */
471 peter@eisentraut.org 315 : 23028 : ltsReadBlock(lt->tapeSet, datablocknum, thisbuf);
2750 heikki.linnakangas@i 316 [ + + ]: 23028 : if (!lt->frozen)
909 317 : 22677 : ltsReleaseBlock(lt->tapeSet, datablocknum);
2670 318 : 23028 : lt->curBlockNumber = lt->nextBlockNumber;
319 : :
320 [ + + ]: 23028 : lt->nbytes += TapeBlockGetNBytes(thisbuf);
321 [ + + ]: 23028 : if (TapeBlockIsLast(thisbuf))
322 : : {
323 : 13934 : lt->nextBlockNumber = -1L;
324 : : /* EOF */
2750 325 : 13934 : break;
326 : : }
327 : : else
2670 328 : 9094 : lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next;
329 : :
330 : : /* Advance to next block, if we have buffer space left */
331 [ + + ]: 9094 : } while (lt->buffer_size - lt->nbytes > BLCKSZ);
332 : :
2750 333 : 30407 : return (lt->nbytes > 0);
334 : : }
335 : :
336 : : static inline uint64
149 michael@paquier.xyz 337 :GNC 760116 : left_offset(uint64 i)
338 : : {
1529 jdavis@postgresql.or 339 :CBC 760116 : return 2 * i + 1;
340 : : }
341 : :
342 : : static inline uint64
149 michael@paquier.xyz 343 :GNC 760116 : right_offset(uint64 i)
344 : : {
1529 jdavis@postgresql.or 345 :CBC 760116 : return 2 * i + 2;
346 : : }
347 : :
348 : : static inline uint64
149 michael@paquier.xyz 349 :GNC 476838 : parent_offset(uint64 i)
350 : : {
1529 jdavis@postgresql.or 351 :CBC 476838 : return (i - 1) / 2;
352 : : }
353 : :
354 : : /*
355 : : * Get the next block for writing.
356 : : */
357 : : static int64
1311 358 : 23443 : ltsGetBlock(LogicalTapeSet *lts, LogicalTape *lt)
359 : : {
360 [ + + ]: 23443 : if (lts->enable_prealloc)
361 : 14037 : return ltsGetPreallocBlock(lts, lt);
362 : : else
363 : 9406 : return ltsGetFreeBlock(lts);
364 : : }
365 : :
366 : : /*
367 : : * Select the lowest currently unused block from the tape set's global free
368 : : * list min heap.
369 : : */
370 : : static int64
8947 tgl@sss.pgh.pa.us 371 : 117646 : ltsGetFreeBlock(LogicalTapeSet *lts)
372 : : {
149 michael@paquier.xyz 373 :GNC 117646 : int64 *heap = lts->freeBlocks;
374 : : int64 blocknum;
375 : : int64 heapsize;
376 : : int64 holeval;
377 : : uint64 holepos;
378 : :
379 : : /* freelist empty; allocate a new block */
1529 jdavis@postgresql.or 380 [ + + ]:CBC 117646 : if (lts->nFreeBlocks == 0)
381 : 9757 : return lts->nBlocksAllocated++;
382 : :
383 : : /* easy if heap contains one element */
384 [ + + ]: 107889 : if (lts->nFreeBlocks == 1)
385 : : {
386 : 189 : lts->nFreeBlocks--;
387 : 189 : return lts->freeBlocks[0];
388 : : }
389 : :
390 : : /* remove top of minheap */
391 : 107700 : blocknum = heap[0];
392 : :
393 : : /* we'll replace it with end of minheap array */
852 tgl@sss.pgh.pa.us 394 : 107700 : holeval = heap[--lts->nFreeBlocks];
395 : :
396 : : /* sift down */
397 : 107700 : holepos = 0; /* holepos is where the "hole" is */
1529 jdavis@postgresql.or 398 : 107700 : heapsize = lts->nFreeBlocks;
399 : : while (true)
400 : 652416 : {
149 michael@paquier.xyz 401 :GNC 760116 : uint64 left = left_offset(holepos);
402 : 760116 : uint64 right = right_offset(holepos);
403 : : uint64 min_child;
404 : :
1529 jdavis@postgresql.or 405 [ + + + + ]:CBC 760116 : if (left < heapsize && right < heapsize)
406 [ + + ]: 658299 : min_child = (heap[left] < heap[right]) ? left : right;
407 [ + + ]: 101817 : else if (left < heapsize)
408 : 21846 : min_child = left;
409 [ - + ]: 79971 : else if (right < heapsize)
1529 jdavis@postgresql.or 410 :UBC 0 : min_child = right;
411 : : else
1529 jdavis@postgresql.or 412 :CBC 79971 : break;
413 : :
852 tgl@sss.pgh.pa.us 414 [ + + ]: 680145 : if (heap[min_child] >= holeval)
1529 jdavis@postgresql.or 415 : 27729 : break;
416 : :
852 tgl@sss.pgh.pa.us 417 : 652416 : heap[holepos] = heap[min_child];
418 : 652416 : holepos = min_child;
419 : : }
420 : 107700 : heap[holepos] = holeval;
421 : :
1529 jdavis@postgresql.or 422 : 107700 : return blocknum;
423 : : }
424 : :
425 : : /*
426 : : * Return the lowest free block number from the tape's preallocation list.
427 : : * Refill the preallocation list with blocks from the tape set's free list if
428 : : * necessary.
429 : : */
430 : : static int64
1419 431 : 14037 : ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
432 : : {
433 : : /* sorted in descending order, so return the last element */
434 [ + + ]: 14037 : if (lt->nprealloc > 0)
435 : 519 : return lt->prealloc[--lt->nprealloc];
436 : :
437 [ + + ]: 13518 : if (lt->prealloc == NULL)
438 : : {
439 : 13506 : lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
149 michael@paquier.xyz 440 :GNC 13506 : lt->prealloc = (int64 *) palloc(sizeof(int64) * lt->prealloc_size);
441 : : }
1419 jdavis@postgresql.or 442 [ + - ]:CBC 12 : else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX)
443 : : {
444 : : /* when the preallocation list runs out, double the size */
445 : 12 : lt->prealloc_size *= 2;
446 [ - + ]: 12 : if (lt->prealloc_size > TAPE_WRITE_PREALLOC_MAX)
1419 jdavis@postgresql.or 447 :UBC 0 : lt->prealloc_size = TAPE_WRITE_PREALLOC_MAX;
149 michael@paquier.xyz 448 :GNC 12 : lt->prealloc = (int64 *) repalloc(lt->prealloc,
449 : 12 : sizeof(int64) * lt->prealloc_size);
450 : : }
451 : :
452 : : /* refill preallocation list */
1419 jdavis@postgresql.or 453 :CBC 13518 : lt->nprealloc = lt->prealloc_size;
454 [ + + ]: 121758 : for (int i = lt->nprealloc; i > 0; i--)
455 : : {
456 : 108240 : lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
457 : :
458 : : /* verify descending order */
459 [ + + - + ]: 108240 : Assert(i == lt->nprealloc || lt->prealloc[i - 1] > lt->prealloc[i]);
460 : : }
461 : :
462 : 13518 : return lt->prealloc[--lt->nprealloc];
463 : : }
464 : :
465 : : /*
466 : : * Return a block# to the freelist.
467 : : */
468 : : static void
149 michael@paquier.xyz 469 :GNC 116880 : ltsReleaseBlock(LogicalTapeSet *lts, int64 blocknum)
470 : : {
471 : : int64 *heap;
472 : : uint64 holepos;
473 : :
474 : : /*
475 : : * Do nothing if we're no longer interested in remembering free space.
476 : : */
6613 tgl@sss.pgh.pa.us 477 [ + + ]:CBC 116880 : if (lts->forgetFreeSpace)
478 : 6822 : return;
479 : :
480 : : /*
481 : : * Enlarge freeBlocks array if full.
482 : : */
8947 483 [ + + ]: 110058 : if (lts->nFreeBlocks >= lts->freeBlocksLen)
484 : : {
485 : : /*
486 : : * If the freelist becomes very large, just return and leak this free
487 : : * block.
488 : : */
149 michael@paquier.xyz 489 [ - + ]:GNC 30 : if (lts->freeBlocksLen * 2 * sizeof(int64) > MaxAllocSize)
1529 jdavis@postgresql.or 490 :UBC 0 : return;
491 : :
8947 tgl@sss.pgh.pa.us 492 :CBC 30 : lts->freeBlocksLen *= 2;
149 michael@paquier.xyz 493 :GNC 30 : lts->freeBlocks = (int64 *) repalloc(lts->freeBlocks,
494 : 30 : lts->freeBlocksLen * sizeof(int64));
495 : : }
496 : :
497 : : /* create a "hole" at end of minheap array */
1529 jdavis@postgresql.or 498 :CBC 110058 : heap = lts->freeBlocks;
852 tgl@sss.pgh.pa.us 499 : 110058 : holepos = lts->nFreeBlocks;
1529 jdavis@postgresql.or 500 : 110058 : lts->nFreeBlocks++;
501 : :
502 : : /* sift up to insert blocknum */
852 tgl@sss.pgh.pa.us 503 [ + + ]: 493578 : while (holepos != 0)
504 : : {
149 michael@paquier.xyz 505 :GNC 476838 : uint64 parent = parent_offset(holepos);
506 : :
852 tgl@sss.pgh.pa.us 507 [ + + ]:CBC 476838 : if (heap[parent] < blocknum)
1529 jdavis@postgresql.or 508 : 93318 : break;
509 : :
852 tgl@sss.pgh.pa.us 510 : 383520 : heap[holepos] = heap[parent];
511 : 383520 : holepos = parent;
512 : : }
513 : 110058 : heap[holepos] = blocknum;
514 : : }
515 : :
516 : : /*
517 : : * Lazily allocate and initialize the read buffer. This avoids waste when many
518 : : * tapes are open at once, but not all are active between rewinding and
519 : : * reading.
520 : : */
521 : : static void
909 heikki.linnakangas@i 522 : 13946 : ltsInitReadBuffer(LogicalTape *lt)
523 : : {
1517 jdavis@postgresql.or 524 [ - + ]: 13946 : Assert(lt->buffer_size > 0);
525 : 13946 : lt->buffer = palloc(lt->buffer_size);
526 : :
527 : : /* Read the first block, or reset if tape is empty */
1522 528 : 13946 : lt->nextBlockNumber = lt->firstBlockNumber;
529 : 13946 : lt->pos = 0;
530 : 13946 : lt->nbytes = 0;
909 heikki.linnakangas@i 531 : 13946 : ltsReadFillBuffer(lt);
1522 jdavis@postgresql.or 532 : 13946 : }
533 : :
534 : : /*
535 : : * Create a tape set, backed by a temporary underlying file.
536 : : *
537 : : * The tape set is initially empty. Use LogicalTapeCreate() to create
538 : : * tapes in it.
539 : : *
540 : : * In a single-process sort, pass NULL argument for fileset, and -1 for
541 : : * worker.
542 : : *
543 : : * In a parallel sort, parallel workers pass the shared fileset handle and
544 : : * their own worker number. After the workers have finished, create the
545 : : * tape set in the leader, passing the shared fileset handle and -1 for
546 : : * worker, and use LogicalTapeImport() to import the worker tapes into it.
547 : : *
548 : : * Currently, the leader will only import worker tapes into the set, it does
549 : : * not create tapes of its own, although in principle that should work.
550 : : *
551 : : * If preallocate is true, blocks for each individual tape are allocated in
552 : : * batches. This avoids fragmentation when writing multiple tapes at the
553 : : * same time.
554 : : */
555 : : LogicalTapeSet *
909 heikki.linnakangas@i 556 : 372 : LogicalTapeSetCreate(bool preallocate, SharedFileSet *fileset, int worker)
557 : : {
558 : : LogicalTapeSet *lts;
559 : :
560 : : /*
561 : : * Create top-level struct including per-tape LogicalTape structs.
562 : : */
1500 jdavis@postgresql.or 563 : 372 : lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet));
2629 heikki.linnakangas@i 564 : 372 : lts->nBlocksAllocated = 0L;
565 : 372 : lts->nBlocksWritten = 0L;
2263 rhaas@postgresql.org 566 : 372 : lts->nHoleBlocks = 0L;
6613 tgl@sss.pgh.pa.us 567 : 372 : lts->forgetFreeSpace = false;
8947 568 : 372 : lts->freeBlocksLen = 32; /* reasonable initial guess */
149 michael@paquier.xyz 569 :GNC 372 : lts->freeBlocks = (int64 *) palloc(lts->freeBlocksLen * sizeof(int64));
8947 tgl@sss.pgh.pa.us 570 :CBC 372 : lts->nFreeBlocks = 0;
1311 jdavis@postgresql.or 571 : 372 : lts->enable_prealloc = preallocate;
572 : :
909 heikki.linnakangas@i 573 : 372 : lts->fileset = fileset;
574 : 372 : lts->worker = worker;
575 : :
576 : : /*
577 : : * Create temp BufFile storage as required.
578 : : *
579 : : * In leader, we hijack the BufFile of the first tape that's imported, and
580 : : * concatenate the BufFiles of any subsequent tapes to that. Hence don't
581 : : * create a BufFile here. Things are simpler for the worker case and the
582 : : * serial case, though. They are generally very similar -- workers use a
583 : : * shared fileset, whereas serial sorts use a conventional serial BufFile.
584 : : */
585 [ + + + + ]: 372 : if (fileset && worker == -1)
586 : 72 : lts->pfile = NULL;
2263 rhaas@postgresql.org 587 [ + + ]: 300 : else if (fileset)
588 : : {
589 : : char filename[MAXPGPATH];
590 : :
591 : 210 : pg_itoa(worker, filename);
958 akapila@postgresql.o 592 : 210 : lts->pfile = BufFileCreateFileSet(&fileset->fs, filename);
593 : : }
594 : : else
2263 rhaas@postgresql.org 595 : 90 : lts->pfile = BufFileCreateTemp(false);
596 : :
8947 tgl@sss.pgh.pa.us 597 : 372 : return lts;
598 : : }
599 : :
600 : : /*
601 : : * Claim ownership of a logical tape from an existing shared BufFile.
602 : : *
603 : : * Caller should be leader process. Though tapes are marked as frozen in
604 : : * workers, they are not frozen when opened within leader, since unfrozen tapes
605 : : * use a larger read buffer. (Frozen tapes have smaller read buffer, optimized
606 : : * for random access.)
607 : : */
608 : : LogicalTape *
909 heikki.linnakangas@i 609 : 146 : LogicalTapeImport(LogicalTapeSet *lts, int worker, TapeShare *shared)
610 : : {
611 : : LogicalTape *lt;
612 : : int64 tapeblocks;
613 : : char filename[MAXPGPATH];
614 : : BufFile *file;
615 : : int64 filesize;
616 : :
617 : 146 : lt = ltsCreateTape(lts);
618 : :
619 : : /*
620 : : * build concatenated view of all buffiles, remembering the block number
621 : : * where each source file begins.
622 : : */
623 : 146 : pg_itoa(worker, filename);
624 : 146 : file = BufFileOpenFileSet(<s->fileset->fs, filename, O_RDONLY, false);
625 : 146 : filesize = BufFileSize(file);
626 : :
627 : : /*
628 : : * Stash first BufFile, and concatenate subsequent BufFiles to that. Store
629 : : * block offset into each tape as we go.
630 : : */
631 : 146 : lt->firstBlockNumber = shared->firstblocknumber;
632 [ + + ]: 146 : if (lts->pfile == NULL)
633 : : {
634 : 72 : lts->pfile = file;
635 : 72 : lt->offsetBlockNumber = 0L;
636 : : }
637 : : else
638 : : {
639 : 74 : lt->offsetBlockNumber = BufFileAppend(lts->pfile, file);
640 : : }
641 : : /* Don't allocate more for read buffer than could possibly help */
642 : 146 : lt->max_size = Min(MaxAllocSize, filesize);
643 : 146 : tapeblocks = filesize / BLCKSZ;
644 : :
645 : : /*
646 : : * Update # of allocated blocks and # blocks written to reflect the
647 : : * imported BufFile. Allocated/written blocks include space used by holes
648 : : * left between concatenated BufFiles. Also track the number of hole
649 : : * blocks so that we can later work backwards to calculate the number of
650 : : * physical blocks for instrumentation.
651 : : */
652 : 146 : lts->nHoleBlocks += lt->offsetBlockNumber - lts->nBlocksAllocated;
653 : :
654 : 146 : lts->nBlocksAllocated = lt->offsetBlockNumber + tapeblocks;
655 : 146 : lts->nBlocksWritten = lts->nBlocksAllocated;
656 : :
657 : 146 : return lt;
658 : : }
659 : :
660 : : /*
661 : : * Close a logical tape set and release all resources.
662 : : *
663 : : * NOTE: This doesn't close any of the tapes! You must close them
664 : : * first, or you can let them be destroyed along with the memory context.
665 : : */
666 : : void
667 : 372 : LogicalTapeSetClose(LogicalTapeSet *lts)
668 : : {
669 : 372 : BufFileClose(lts->pfile);
8947 tgl@sss.pgh.pa.us 670 : 372 : pfree(lts->freeBlocks);
671 : 372 : pfree(lts);
672 : 372 : }
673 : :
674 : : /*
675 : : * Create a logical tape in the given tapeset.
676 : : *
677 : : * The tape is initialized in write state.
678 : : */
679 : : LogicalTape *
909 heikki.linnakangas@i 680 : 25782 : LogicalTapeCreate(LogicalTapeSet *lts)
681 : : {
682 : : /*
683 : : * The only thing that currently prevents creating new tapes in leader is
684 : : * the fact that BufFiles opened using BufFileOpenShared() are read-only
685 : : * by definition, but that could be changed if it seemed worthwhile. For
686 : : * now, writing to the leader tape will raise a "Bad file descriptor"
687 : : * error, so tuplesort must avoid writing to the leader tape altogether.
688 : : */
689 [ + + - + ]: 25782 : if (lts->fileset && lts->worker == -1)
909 heikki.linnakangas@i 690 [ # # ]:UBC 0 : elog(ERROR, "cannot create new tapes in leader process");
691 : :
909 heikki.linnakangas@i 692 :CBC 25782 : return ltsCreateTape(lts);
693 : : }
694 : :
695 : : static LogicalTape *
696 : 25928 : ltsCreateTape(LogicalTapeSet *lts)
697 : : {
698 : : LogicalTape *lt;
699 : :
700 : : /*
701 : : * Create per-tape struct. Note we allocate the I/O buffer lazily.
702 : : */
703 : 25928 : lt = palloc(sizeof(LogicalTape));
704 : 25928 : lt->tapeSet = lts;
705 : 25928 : lt->writing = true;
706 : 25928 : lt->frozen = false;
707 : 25928 : lt->dirty = false;
708 : 25928 : lt->firstBlockNumber = -1L;
709 : 25928 : lt->curBlockNumber = -1L;
710 : 25928 : lt->nextBlockNumber = -1L;
711 : 25928 : lt->offsetBlockNumber = 0L;
712 : 25928 : lt->buffer = NULL;
713 : 25928 : lt->buffer_size = 0;
714 : : /* palloc() larger than MaxAllocSize would fail */
715 : 25928 : lt->max_size = MaxAllocSize;
716 : 25928 : lt->pos = 0;
717 : 25928 : lt->nbytes = 0;
718 : 25928 : lt->prealloc = NULL;
719 : 25928 : lt->nprealloc = 0;
720 : 25928 : lt->prealloc_size = 0;
721 : :
722 : 25928 : return lt;
723 : : }
724 : :
725 : : /*
726 : : * Close a logical tape.
727 : : *
728 : : * Note: This doesn't return any blocks to the free list! You must read
729 : : * the tape to the end first, to reuse the space. In current use, though,
730 : : * we only close tapes after fully reading them.
731 : : */
732 : : void
733 : 13811 : LogicalTapeClose(LogicalTape *lt)
734 : : {
735 [ + - ]: 13811 : if (lt->buffer)
736 : 13811 : pfree(lt->buffer);
737 : 13811 : pfree(lt);
738 : 13811 : }
739 : :
740 : : /*
741 : : * Mark a logical tape set as not needing management of free space anymore.
742 : : *
743 : : * This should be called if the caller does not intend to write any more data
744 : : * into the tape set, but is reading from un-frozen tapes. Since no more
745 : : * writes are planned, remembering free blocks is no longer useful. Setting
746 : : * this flag lets us avoid wasting time and space in ltsReleaseBlock(), which
747 : : * is not designed to handle large numbers of free blocks.
748 : : */
749 : : void
6613 tgl@sss.pgh.pa.us 750 : 120 : LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
751 : : {
752 : 120 : lts->forgetFreeSpace = true;
753 : 120 : }
754 : :
755 : : /*
756 : : * Write to a logical tape.
757 : : *
758 : : * There are no error returns; we ereport() on failure.
759 : : */
760 : : void
471 peter@eisentraut.org 761 : 6866833 : LogicalTapeWrite(LogicalTape *lt, const void *ptr, size_t size)
762 : : {
909 heikki.linnakangas@i 763 : 6866833 : LogicalTapeSet *lts = lt->tapeSet;
764 : : size_t nthistime;
765 : :
8947 tgl@sss.pgh.pa.us 766 [ - + ]: 6866833 : Assert(lt->writing);
2263 rhaas@postgresql.org 767 [ - + ]: 6866833 : Assert(lt->offsetBlockNumber == 0L);
768 : :
769 : : /* Allocate data buffer and first block on first write */
6629 tgl@sss.pgh.pa.us 770 [ + + ]: 6866833 : if (lt->buffer == NULL)
771 : : {
772 : 14016 : lt->buffer = (char *) palloc(BLCKSZ);
2750 heikki.linnakangas@i 773 : 14016 : lt->buffer_size = BLCKSZ;
774 : : }
2670 775 [ + + ]: 6866833 : if (lt->curBlockNumber == -1)
776 : : {
777 [ - + ]: 14016 : Assert(lt->firstBlockNumber == -1);
778 [ - + ]: 14016 : Assert(lt->pos == 0);
779 : :
1311 jdavis@postgresql.or 780 : 14016 : lt->curBlockNumber = ltsGetBlock(lts, lt);
2670 heikki.linnakangas@i 781 : 14016 : lt->firstBlockNumber = lt->curBlockNumber;
782 : :
783 : 14016 : TapeBlockGetTrailer(lt->buffer)->prev = -1L;
784 : : }
785 : :
2750 786 [ - + ]: 6866833 : Assert(lt->buffer_size == BLCKSZ);
8947 tgl@sss.pgh.pa.us 787 [ + + ]: 13739989 : while (size > 0)
788 : : {
1407 jdavis@postgresql.or 789 [ + + ]: 6873156 : if (lt->pos >= (int) TapeBlockPayloadSize)
790 : : {
791 : : /* Buffer full, dump it out */
792 : : int64 nextBlockNumber;
793 : :
2670 heikki.linnakangas@i 794 [ - + ]: 9427 : if (!lt->dirty)
795 : : {
796 : : /* Hmm, went directly from reading to writing? */
7569 tgl@sss.pgh.pa.us 797 [ # # ]:UBC 0 : elog(ERROR, "invalid logtape state: should be dirty");
798 : : }
799 : :
800 : : /*
801 : : * First allocate the next block, so that we can store it in the
802 : : * 'next' pointer of this block.
803 : : */
909 heikki.linnakangas@i 804 :CBC 9427 : nextBlockNumber = ltsGetBlock(lt->tapeSet, lt);
805 : :
806 : : /* set the next-pointer and dump the current block. */
2670 807 : 9427 : TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
471 peter@eisentraut.org 808 : 9427 : ltsWriteBlock(lt->tapeSet, lt->curBlockNumber, lt->buffer);
809 : :
810 : : /* initialize the prev-pointer of the next block */
2670 heikki.linnakangas@i 811 : 9427 : TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
812 : 9427 : lt->curBlockNumber = nextBlockNumber;
8947 tgl@sss.pgh.pa.us 813 : 9427 : lt->pos = 0;
814 : 9427 : lt->nbytes = 0;
815 : : }
816 : :
2670 heikki.linnakangas@i 817 : 6873156 : nthistime = TapeBlockPayloadSize - lt->pos;
8947 tgl@sss.pgh.pa.us 818 [ + + ]: 6873156 : if (nthistime > size)
819 : 6863729 : nthistime = size;
820 [ - + ]: 6873156 : Assert(nthistime > 0);
821 : :
822 : 6873156 : memcpy(lt->buffer + lt->pos, ptr, nthistime);
823 : :
824 : 6873156 : lt->dirty = true;
825 : 6873156 : lt->pos += nthistime;
826 [ + - ]: 6873156 : if (lt->nbytes < lt->pos)
827 : 6873156 : lt->nbytes = lt->pos;
471 peter@eisentraut.org 828 : 6873156 : ptr = (const char *) ptr + nthistime;
8947 tgl@sss.pgh.pa.us 829 : 6873156 : size -= nthistime;
830 : : }
831 : 6866833 : }
832 : :
833 : : /*
834 : : * Rewind logical tape and switch from writing to reading.
835 : : *
836 : : * The tape must currently be in writing state, or "frozen" in read state.
837 : : *
838 : : * 'buffer_size' specifies how much memory to use for the read buffer.
839 : : * Regardless of the argument, the actual amount of memory used is between
840 : : * BLCKSZ and MaxAllocSize, and is a multiple of BLCKSZ. The given value is
841 : : * rounded down and truncated to fit those constraints, if necessary. If the
842 : : * tape is frozen, the 'buffer_size' argument is ignored, and a small BLCKSZ
843 : : * byte buffer is used.
844 : : */
845 : : void
909 heikki.linnakangas@i 846 : 13946 : LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size)
847 : : {
848 : 13946 : LogicalTapeSet *lts = lt->tapeSet;
849 : :
850 : : /*
851 : : * Round and cap buffer_size if needed.
852 : : */
2741 853 [ + + ]: 13946 : if (lt->frozen)
854 : 3 : buffer_size = BLCKSZ;
855 : : else
856 : : {
857 : : /* need at least one block */
858 [ + + ]: 13943 : if (buffer_size < BLCKSZ)
859 : 90 : buffer_size = BLCKSZ;
860 : :
861 : : /* palloc() larger than max_size is unlikely to be helpful */
2263 rhaas@postgresql.org 862 [ + + ]: 13943 : if (buffer_size > lt->max_size)
863 : 146 : buffer_size = lt->max_size;
864 : :
865 : : /* round down to BLCKSZ boundary */
2741 heikki.linnakangas@i 866 : 13943 : buffer_size -= buffer_size % BLCKSZ;
867 : : }
868 : :
869 [ + + ]: 13946 : if (lt->writing)
870 : : {
871 : : /*
872 : : * Completion of a write phase. Flush last partial data block, and
873 : : * rewind for normal (destructive) read.
874 : : */
875 [ + + ]: 13943 : if (lt->dirty)
876 : : {
877 : : /*
878 : : * As long as we've filled the buffer at least once, its contents
879 : : * are entirely defined from valgrind's point of view, even though
880 : : * contents beyond the current end point may be stale. But it's
881 : : * possible - at least in the case of a parallel sort - to sort
882 : : * such small amount of data that we do not fill the buffer even
883 : : * once. Tell valgrind that its contents are defined, so it
884 : : * doesn't bleat.
885 : : */
886 : : VALGRIND_MAKE_MEM_DEFINED(lt->buffer + lt->nbytes,
887 : : lt->buffer_size - lt->nbytes);
888 : :
2670 889 : 13797 : TapeBlockSetNBytes(lt->buffer, lt->nbytes);
471 peter@eisentraut.org 890 : 13797 : ltsWriteBlock(lt->tapeSet, lt->curBlockNumber, lt->buffer);
891 : : }
2741 heikki.linnakangas@i 892 : 13943 : lt->writing = false;
893 : : }
894 : : else
895 : : {
896 : : /*
897 : : * This is only OK if tape is frozen; we rewind for (another) read
898 : : * pass.
899 : : */
900 [ - + ]: 3 : Assert(lt->frozen);
901 : : }
902 : :
903 [ + + ]: 13946 : if (lt->buffer)
904 : 13800 : pfree(lt->buffer);
905 : :
906 : : /* the buffer is lazily allocated, but set the size here */
907 : 13946 : lt->buffer = NULL;
1517 jdavis@postgresql.or 908 : 13946 : lt->buffer_size = buffer_size;
909 : :
910 : : /* free the preallocation list, and return unused block numbers */
1419 911 [ + + ]: 13946 : if (lt->prealloc != NULL)
912 : : {
913 [ + + ]: 107709 : for (int i = lt->nprealloc; i > 0; i--)
914 : 94203 : ltsReleaseBlock(lts, lt->prealloc[i - 1]);
915 : 13506 : pfree(lt->prealloc);
916 : 13506 : lt->prealloc = NULL;
917 : 13506 : lt->nprealloc = 0;
918 : 13506 : lt->prealloc_size = 0;
919 : : }
2741 heikki.linnakangas@i 920 : 13946 : }
921 : :
922 : : /*
923 : : * Read from a logical tape.
924 : : *
925 : : * Early EOF is indicated by return value less than #bytes requested.
926 : : */
927 : : size_t
909 928 : 7004328 : LogicalTapeRead(LogicalTape *lt, void *ptr, size_t size)
929 : : {
8768 bruce@momjian.us 930 : 7004328 : size_t nread = 0;
931 : : size_t nthistime;
932 : :
933 [ - + ]: 7004328 : Assert(!lt->writing);
934 : :
1522 jdavis@postgresql.or 935 [ + + ]: 7004328 : if (lt->buffer == NULL)
909 heikki.linnakangas@i 936 : 13946 : ltsInitReadBuffer(lt);
937 : :
8947 tgl@sss.pgh.pa.us 938 [ + + ]: 13997046 : while (size > 0)
939 : : {
940 [ + + ]: 7006224 : if (lt->pos >= lt->nbytes)
941 : : {
942 : : /* Try to load more data into buffer. */
909 heikki.linnakangas@i 943 [ + + ]: 16461 : if (!ltsReadFillBuffer(lt))
8947 tgl@sss.pgh.pa.us 944 : 13506 : break; /* EOF */
945 : : }
946 : :
947 : 6992718 : nthistime = lt->nbytes - lt->pos;
948 [ + + ]: 6992718 : if (nthistime > size)
949 : 6975832 : nthistime = size;
950 [ - + ]: 6992718 : Assert(nthistime > 0);
951 : :
952 : 6992718 : memcpy(ptr, lt->buffer + lt->pos, nthistime);
953 : :
954 : 6992718 : lt->pos += nthistime;
471 peter@eisentraut.org 955 : 6992718 : ptr = (char *) ptr + nthistime;
8947 tgl@sss.pgh.pa.us 956 : 6992718 : size -= nthistime;
957 : 6992718 : nread += nthistime;
958 : : }
959 : :
960 : 7004328 : return nread;
961 : : }
962 : :
963 : : /*
964 : : * "Freeze" the contents of a tape so that it can be read multiple times
965 : : * and/or read backwards. Once a tape is frozen, its contents will not
966 : : * be released until the LogicalTapeSet is destroyed. This is expected
967 : : * to be used only for the final output pass of a merge.
968 : : *
969 : : * This *must* be called just at the end of a write pass, before the
970 : : * tape is rewound (after rewind is too late!). It performs a rewind
971 : : * and switch to read mode "for free". An immediately following rewind-
972 : : * for-read call is OK but not necessary.
973 : : *
974 : : * share output argument is set with details of storage used for tape after
975 : : * freezing, which may be passed to LogicalTapeSetCreate within leader
976 : : * process later. This metadata is only of interest to worker callers
977 : : * freezing their final output for leader (single materialized tape).
978 : : * Serial sorts should set share to NULL.
979 : : */
980 : : void
909 heikki.linnakangas@i 981 : 219 : LogicalTapeFreeze(LogicalTape *lt, TapeShare *share)
982 : : {
983 : 219 : LogicalTapeSet *lts = lt->tapeSet;
984 : :
8947 tgl@sss.pgh.pa.us 985 [ - + ]: 219 : Assert(lt->writing);
2263 rhaas@postgresql.org 986 [ - + ]: 219 : Assert(lt->offsetBlockNumber == 0L);
987 : :
988 : : /*
989 : : * Completion of a write phase. Flush last partial data block, and rewind
990 : : * for nondestructive read.
991 : : */
8947 tgl@sss.pgh.pa.us 992 [ + - ]: 219 : if (lt->dirty)
993 : : {
994 : : /*
995 : : * As long as we've filled the buffer at least once, its contents are
996 : : * entirely defined from valgrind's point of view, even though
997 : : * contents beyond the current end point may be stale. But it's
998 : : * possible - at least in the case of a parallel sort - to sort such
999 : : * small amount of data that we do not fill the buffer even once. Tell
1000 : : * valgrind that its contents are defined, so it doesn't bleat.
1001 : : */
1002 : : VALGRIND_MAKE_MEM_DEFINED(lt->buffer + lt->nbytes,
1003 : : lt->buffer_size - lt->nbytes);
1004 : :
2670 heikki.linnakangas@i 1005 : 219 : TapeBlockSetNBytes(lt->buffer, lt->nbytes);
471 peter@eisentraut.org 1006 : 219 : ltsWriteBlock(lt->tapeSet, lt->curBlockNumber, lt->buffer);
1007 : : }
8947 tgl@sss.pgh.pa.us 1008 : 219 : lt->writing = false;
1009 : 219 : lt->frozen = true;
1010 : :
1011 : : /*
1012 : : * The seek and backspace functions assume a single block read buffer.
1013 : : * That's OK with current usage. A larger buffer is helpful to make the
1014 : : * read pattern of the backing file look more sequential to the OS, when
1015 : : * we're reading from multiple tapes. But at the end of a sort, when a
1016 : : * tape is frozen, we only read from a single tape anyway.
1017 : : */
2750 heikki.linnakangas@i 1018 [ + - - + ]: 219 : if (!lt->buffer || lt->buffer_size != BLCKSZ)
1019 : : {
2750 heikki.linnakangas@i 1020 [ # # ]:UBC 0 : if (lt->buffer)
1021 : 0 : pfree(lt->buffer);
1022 : 0 : lt->buffer = palloc(BLCKSZ);
1023 : 0 : lt->buffer_size = BLCKSZ;
1024 : : }
1025 : :
1026 : : /* Read the first block, or reset if tape is empty */
2670 heikki.linnakangas@i 1027 :CBC 219 : lt->curBlockNumber = lt->firstBlockNumber;
8947 tgl@sss.pgh.pa.us 1028 : 219 : lt->pos = 0;
1029 : 219 : lt->nbytes = 0;
1030 : :
2670 heikki.linnakangas@i 1031 [ - + ]: 219 : if (lt->firstBlockNumber == -1L)
2670 heikki.linnakangas@i 1032 :UBC 0 : lt->nextBlockNumber = -1L;
471 peter@eisentraut.org 1033 :CBC 219 : ltsReadBlock(lt->tapeSet, lt->curBlockNumber, lt->buffer);
2670 heikki.linnakangas@i 1034 [ + + ]: 219 : if (TapeBlockIsLast(lt->buffer))
1035 : 182 : lt->nextBlockNumber = -1L;
1036 : : else
1037 : 37 : lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
1038 [ + + ]: 219 : lt->nbytes = TapeBlockGetNBytes(lt->buffer);
1039 : :
1040 : : /* Handle extra steps when caller is to share its tapeset */
2263 rhaas@postgresql.org 1041 [ + + ]: 219 : if (share)
1042 : : {
958 akapila@postgresql.o 1043 : 210 : BufFileExportFileSet(lts->pfile);
2263 rhaas@postgresql.org 1044 : 210 : share->firstblocknumber = lt->firstBlockNumber;
1045 : : }
8947 tgl@sss.pgh.pa.us 1046 : 219 : }
1047 : :
1048 : : /*
1049 : : * Backspace the tape a given number of bytes. (We also support a more
1050 : : * general seek interface, see below.)
1051 : : *
1052 : : * *Only* a frozen-for-read tape can be backed up; we don't support
1053 : : * random access during write, and an unfrozen read tape may have
1054 : : * already discarded the desired data!
1055 : : *
1056 : : * Returns the number of bytes backed up. It can be less than the
1057 : : * requested amount, if there isn't that much data before the current
1058 : : * position. The tape is positioned to the beginning of the tape in
1059 : : * that case.
1060 : : */
1061 : : size_t
909 heikki.linnakangas@i 1062 : 36 : LogicalTapeBackspace(LogicalTape *lt, size_t size)
1063 : : {
2670 1064 : 36 : size_t seekpos = 0;
1065 : :
8947 tgl@sss.pgh.pa.us 1066 [ - + ]: 36 : Assert(lt->frozen);
2750 heikki.linnakangas@i 1067 [ - + ]: 36 : Assert(lt->buffer_size == BLCKSZ);
1068 : :
1522 jdavis@postgresql.or 1069 [ - + ]: 36 : if (lt->buffer == NULL)
909 heikki.linnakangas@i 1070 :UBC 0 : ltsInitReadBuffer(lt);
1071 : :
1072 : : /*
1073 : : * Easy case for seek within current block.
1074 : : */
8947 tgl@sss.pgh.pa.us 1075 [ + + ]:CBC 36 : if (size <= (size_t) lt->pos)
1076 : : {
1077 : 33 : lt->pos -= (int) size;
2670 heikki.linnakangas@i 1078 : 33 : return size;
1079 : : }
1080 : :
1081 : : /*
1082 : : * Not-so-easy case, have to walk back the chain of blocks. This
1083 : : * implementation would be pretty inefficient for long seeks, but we
1084 : : * really aren't doing that (a seek over one tuple is typical).
1085 : : */
1086 : 3 : seekpos = (size_t) lt->pos; /* part within this block */
1087 [ + - ]: 3 : while (size > seekpos)
1088 : : {
149 michael@paquier.xyz 1089 :GNC 3 : int64 prev = TapeBlockGetTrailer(lt->buffer)->prev;
1090 : :
2670 heikki.linnakangas@i 1091 [ + - ]:CBC 3 : if (prev == -1L)
1092 : : {
1093 : : /* Tried to back up beyond the beginning of tape. */
1094 [ - + ]: 3 : if (lt->curBlockNumber != lt->firstBlockNumber)
2670 heikki.linnakangas@i 1095 [ # # ]:UBC 0 : elog(ERROR, "unexpected end of tape");
2670 heikki.linnakangas@i 1096 :CBC 3 : lt->pos = 0;
1097 : 3 : return seekpos;
1098 : : }
1099 : :
471 peter@eisentraut.org 1100 :UBC 0 : ltsReadBlock(lt->tapeSet, prev, lt->buffer);
1101 : :
2670 heikki.linnakangas@i 1102 [ # # ]: 0 : if (TapeBlockGetTrailer(lt->buffer)->next != lt->curBlockNumber)
149 michael@paquier.xyz 1103 [ # # ]:UNC 0 : elog(ERROR, "broken tape, next of block %lld is %lld, expected %lld",
1104 : : (long long) prev,
1105 : : (long long) (TapeBlockGetTrailer(lt->buffer)->next),
1106 : : (long long) lt->curBlockNumber);
1107 : :
2670 heikki.linnakangas@i 1108 :UBC 0 : lt->nbytes = TapeBlockPayloadSize;
1109 : 0 : lt->curBlockNumber = prev;
1110 : 0 : lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
1111 : :
1112 : 0 : seekpos += TapeBlockPayloadSize;
1113 : : }
1114 : :
1115 : : /*
1116 : : * 'seekpos' can now be greater than 'size', because it points to the
1117 : : * beginning the target block. The difference is the position within the
1118 : : * page.
1119 : : */
1120 : 0 : lt->pos = seekpos - size;
1121 : 0 : return size;
1122 : : }
1123 : :
1124 : : /*
1125 : : * Seek to an arbitrary position in a logical tape.
1126 : : *
1127 : : * *Only* a frozen-for-read tape can be seeked.
1128 : : *
1129 : : * Must be called with a block/offset previously returned by
1130 : : * LogicalTapeTell().
1131 : : */
1132 : : void
149 michael@paquier.xyz 1133 :GNC 3096 : LogicalTapeSeek(LogicalTape *lt, int64 blocknum, int offset)
1134 : : {
8947 tgl@sss.pgh.pa.us 1135 [ - + ]:CBC 3096 : Assert(lt->frozen);
2670 heikki.linnakangas@i 1136 [ + - - + ]: 3096 : Assert(offset >= 0 && offset <= TapeBlockPayloadSize);
2750 1137 [ - + ]: 3096 : Assert(lt->buffer_size == BLCKSZ);
1138 : :
1522 jdavis@postgresql.or 1139 [ - + ]: 3096 : if (lt->buffer == NULL)
909 heikki.linnakangas@i 1140 :UBC 0 : ltsInitReadBuffer(lt);
1141 : :
2670 heikki.linnakangas@i 1142 [ + + ]:CBC 3096 : if (blocknum != lt->curBlockNumber)
1143 : : {
471 peter@eisentraut.org 1144 : 39 : ltsReadBlock(lt->tapeSet, blocknum, lt->buffer);
2670 heikki.linnakangas@i 1145 : 39 : lt->curBlockNumber = blocknum;
1146 : 39 : lt->nbytes = TapeBlockPayloadSize;
1147 : 39 : lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
1148 : : }
1149 : :
1150 [ - + ]: 3096 : if (offset > lt->nbytes)
2670 heikki.linnakangas@i 1151 [ # # ]:UBC 0 : elog(ERROR, "invalid tape seek position");
8947 tgl@sss.pgh.pa.us 1152 :CBC 3096 : lt->pos = offset;
1153 : 3096 : }
1154 : :
1155 : : /*
1156 : : * Obtain current position in a form suitable for a later LogicalTapeSeek.
1157 : : *
1158 : : * NOTE: it'd be OK to do this during write phase with intention of using
1159 : : * the position for a seek after freezing. Not clear if anyone needs that.
1160 : : */
1161 : : void
149 michael@paquier.xyz 1162 :GNC 4404 : LogicalTapeTell(LogicalTape *lt, int64 *blocknum, int *offset)
1163 : : {
1522 jdavis@postgresql.or 1164 [ - + ]:CBC 4404 : if (lt->buffer == NULL)
909 heikki.linnakangas@i 1165 :UBC 0 : ltsInitReadBuffer(lt);
1166 : :
2263 rhaas@postgresql.org 1167 [ - + ]:CBC 4404 : Assert(lt->offsetBlockNumber == 0L);
1168 : :
1169 : : /* With a larger buffer, 'pos' wouldn't be the same as offset within page */
2750 heikki.linnakangas@i 1170 [ - + ]: 4404 : Assert(lt->buffer_size == BLCKSZ);
1171 : :
8947 tgl@sss.pgh.pa.us 1172 : 4404 : *blocknum = lt->curBlockNumber;
1173 : 4404 : *offset = lt->pos;
1174 : 4404 : }
1175 : :
1176 : : /*
1177 : : * Obtain total disk space currently used by a LogicalTapeSet, in blocks. Does
1178 : : * not account for open write buffer, if any.
1179 : : */
1180 : : int64
6753 1181 : 13878 : LogicalTapeSetBlocks(LogicalTapeSet *lts)
1182 : : {
1307 jdavis@postgresql.or 1183 : 13878 : return lts->nBlocksWritten - lts->nHoleBlocks;
1184 : : }
|