Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * md.c
4 : * This code manages relations that reside on magnetic disk.
5 : *
6 : * Or at least, that was what the Berkeley folk had in mind when they named
7 : * this file. In reality, what this code provides is an interface from
8 : * the smgr API to Unix-like filesystem APIs, so it will work with any type
9 : * of device for which the operating system provides filesystem support.
10 : * It doesn't matter whether the bits are on spinning rust or some other
11 : * storage technology.
12 : *
13 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : *
17 : * IDENTIFICATION
18 : * src/backend/storage/smgr/md.c
19 : *
20 : *-------------------------------------------------------------------------
21 : */
22 : #include "postgres.h"
23 :
24 : #include <unistd.h>
25 : #include <fcntl.h>
26 : #include <sys/file.h>
27 :
28 : #include "access/xlog.h"
29 : #include "access/xlogutils.h"
30 : #include "commands/tablespace.h"
31 : #include "miscadmin.h"
32 : #include "pg_trace.h"
33 : #include "pgstat.h"
34 : #include "postmaster/bgwriter.h"
35 : #include "storage/bufmgr.h"
36 : #include "storage/fd.h"
37 : #include "storage/md.h"
38 : #include "storage/relfilelocator.h"
39 : #include "storage/smgr.h"
40 : #include "storage/sync.h"
41 : #include "utils/hsearch.h"
42 : #include "utils/memutils.h"
43 :
44 : /*
45 : * The magnetic disk storage manager keeps track of open file
46 : * descriptors in its own descriptor pool. This is done to make it
47 : * easier to support relations that are larger than the operating
48 : * system's file size limit (often 2GBytes). In order to do that,
49 : * we break relations up into "segment" files that are each shorter than
50 : * the OS file size limit. The segment size is set by the RELSEG_SIZE
51 : * configuration constant in pg_config.h.
52 : *
53 : * On disk, a relation must consist of consecutively numbered segment
54 : * files in the pattern
55 : * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
56 : * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
57 : * -- Optionally, any number of inactive segments of size 0 blocks.
58 : * The full and partial segments are collectively the "active" segments.
59 : * Inactive segments are those that once contained data but are currently
60 : * not needed because of an mdtruncate() operation. The reason for leaving
61 : * them present at size zero, rather than unlinking them, is that other
62 : * backends and/or the checkpointer might be holding open file references to
63 : * such segments. If the relation expands again after mdtruncate(), such
64 : * that a deactivated segment becomes active again, it is important that
65 : * such file references still be valid --- else data might get written
66 : * out to an unlinked old copy of a segment file that will eventually
67 : * disappear.
68 : *
69 : * File descriptors are stored in the per-fork md_seg_fds arrays inside
70 : * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
71 : * Note that a fork's md_num_open_segs having a specific value does not
72 : * necessarily mean the relation doesn't have additional segments; we may
73 : * just not have opened the next segment yet. (We could not have "all
74 : * segments are in the array" as an invariant anyway, since another backend
75 : * could extend the relation while we aren't looking.) We do not have
76 : * entries for inactive segments, however; as soon as we find a partial
77 : * segment, we assume that any subsequent segments are inactive.
78 : *
79 : * The entire MdfdVec array is palloc'd in the MdCxt memory context.
80 : */
81 :
82 : typedef struct _MdfdVec
83 : {
84 : File mdfd_vfd; /* fd number in fd.c's pool */
85 : BlockNumber mdfd_segno; /* segment number, from 0 */
86 : } MdfdVec;
87 :
88 : static MemoryContext MdCxt; /* context for all MdfdVec objects */
89 :
90 :
/*
 * Fill in a FileTag that identifies one md.c segment file.
 *
 * The whole tag is zeroed first, then the md sync handler, relation file
 * locator, fork number and segment number are stored.  The macro is a single
 * comma expression, so it can be used wherever an expression is allowed.
 */
#define INIT_MD_FILETAG(tag_, rloc_, fork_, seg_) \
( \
	memset(&(tag_), 0, sizeof(FileTag)), \
	(tag_).handler = SYNC_HANDLER_MD, \
	(tag_).rlocator = (rloc_), \
	(tag_).forknum = (fork_), \
	(tag_).segno = (seg_) \
)
100 :
101 :
/*
 * "behavior" flag bits accepted by mdopenfork() and _mdfd_getseg().
 */

/* segment missing => ereport */
#define EXTENSION_FAIL				(1 << 0)

/* segment missing => return NULL */
#define EXTENSION_RETURN_NULL		(1 << 1)

/* segment missing => create it (and any others needed) */
#define EXTENSION_CREATE			(1 << 2)

/* segment missing => create it, but only while in recovery */
#define EXTENSION_CREATE_RECOVERY	(1 << 3)

/*
 * Permit opening a segment even though an earlier segment is shorter than
 * RELSEG_SIZE (for example, an inactive segment as described at the top of
 * the file).  Once this has been used, mdnblocks() and related functionality
 * can no longer be trusted; that is currently acceptable because only the
 * checkpointer needs this flag, and it never calls mdnblocks().
 */
#define EXTENSION_DONT_CHECK_SIZE	(1 << 4)

/* only use a segment that is already open; never open one */
#define EXTENSION_DONT_OPEN			(1 << 5)
121 :
122 :
123 : /* local routines */
124 : static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
125 : bool isRedo);
126 : static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
127 : static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
128 : MdfdVec *seg);
129 : static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
130 : BlockNumber segno);
131 : static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
132 : BlockNumber segno);
133 : static void _fdvec_resize(SMgrRelation reln,
134 : ForkNumber forknum,
135 : int nseg);
136 : static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
137 : BlockNumber segno);
138 : static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
139 : BlockNumber segno, int oflags);
140 : static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
141 : BlockNumber blkno, bool skipFsync, int behavior);
142 : static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
143 : MdfdVec *seg);
144 :
145 : static inline int
1 tmunro 146 GNC 1739246 : _mdfd_open_flags(void)
147 : {
148 1739246 : int flags = O_RDWR | PG_BINARY;
149 :
150 1739246 : if (io_direct_flags & IO_DIRECT_DATA)
151 304 : flags |= PG_O_DIRECT;
152 :
153 1739246 : return flags;
154 : }
155 :
9770 scrappy 156 ECB : /*
157 : * mdinit() -- Initialize private state for magnetic disk storage manager.
158 : */
159 : void
7956 tgl 160 CBC 13296 : mdinit(void)
9770 scrappy 161 ECB : {
8320 tgl 162 GIC 13296 : MdCxt = AllocSetContextCreate(TopMemoryContext,
8320 tgl 163 ECB : "MdSmgr",
164 : ALLOCSET_DEFAULT_SIZES);
5036 heikki.linnakangas 165 GIC 13296 : }
166 :
167 : /*
168 : * mdexists() -- Does the physical file exist?
169 : *
5354 heikki.linnakangas 170 ECB : * Note: this will return true for lingering files, with pending deletions
171 : */
172 : bool
202 pg 173 GNC 1061928 : mdexists(SMgrRelation reln, ForkNumber forknum)
174 : {
5354 heikki.linnakangas 175 ECB : /*
176 : * Close it first, to ensure that we notice if the fork has been unlinked
177 : * since we opened it. As an optimization, we can skip that in recovery,
178 : * which already closes relations when dropping them.
179 : */
367 tmunro 180 GIC 1061928 : if (!InRecovery)
202 pg 181 GNC 577827 : mdclose(reln, forknum);
182 :
183 1061928 : return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
184 : }
185 :
186 : /*
187 : * mdcreate() -- Create a new relation on magnetic disk.
188 : *
189 : * If isRedo is true, it's okay for the relation to exist already.
6998 tgl 190 ECB : */
5940 191 : void
202 pg 192 GNC 2721301 : mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
9770 scrappy 193 ECB : {
194 : MdfdVec *mdfd;
195 : char *path;
196 : File fd;
197 :
202 pg 198 GNC 2721301 : if (isRedo && reln->md_num_open_segs[forknum] > 0)
5940 tgl 199 GIC 2502428 : return; /* created and opened already... */
200 :
202 pg 201 GNC 218873 : Assert(reln->md_num_open_segs[forknum] == 0);
8329 tgl 202 ECB :
203 : /*
204 : * We may be using the target table space for the first time in this
205 : * database, so create a per-database subdirectory if needed.
206 : *
207 : * XXX this is a fairly ugly violation of module layering, but this seems
1362 tmunro 208 : * to be the best place to put the check. Maybe TablespaceCreateDbspace
209 : * should be here and not in commands/tablespace.c? But that would imply
210 : * importing a lot of stuff that smgr.c oughtn't know, either.
211 : */
277 rhaas 212 GNC 218873 : TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
213 : reln->smgr_rlocator.locator.dbOid,
214 : isRedo);
215 :
202 pg 216 218873 : path = relpath(reln->smgr_rlocator, forknum);
217 :
1 tmunro 218 218873 : fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
219 :
9345 bruce 220 GIC 218873 : if (fd < 0)
221 : {
8053 bruce 222 CBC 3824 : int save_errno = errno;
223 :
1532 akapila 224 GIC 3824 : if (isRedo)
1 tmunro 225 GNC 3824 : fd = PathNameOpenFile(path, _mdfd_open_flags());
9345 bruce 226 CBC 3824 : if (fd < 0)
227 : {
5940 tgl 228 ECB : /* be sure to report the error reported by create, not open */
8329 tgl 229 UIC 0 : errno = save_errno;
5940 tgl 230 LBC 0 : ereport(ERROR,
231 : (errcode_for_file_access(),
4995 heikki.linnakangas 232 ECB : errmsg("could not create file \"%s\": %m", path)));
233 : }
9345 bruce 234 : }
8187 tgl 235 :
8187 tgl 236 CBC 218873 : pfree(path);
237 :
202 pg 238 GNC 218873 : _fdvec_resize(reln, forknum, 1);
239 218873 : mdfd = &reln->md_seg_fds[forknum][0];
2404 andres 240 GBC 218873 : mdfd->mdfd_vfd = fd;
2404 andres 241 GIC 218873 : mdfd->mdfd_segno = 0;
242 : }
243 :
244 : /*
245 : * mdunlink() -- Unlink a relation.
6998 tgl 246 ECB : *
247 : * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
248 : * there won't be an SMgrRelation hashtable entry anymore.
249 : *
250 : * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
3916 251 : * to delete all forks.
252 : *
253 : * For regular relations, we don't unlink the first segment file of the rel,
254 : * but just truncate it to zero length, and record a request to unlink it after
255 : * the next checkpoint. Additional segments can be unlinked immediately,
256 : * however. Leaving the empty file in place prevents that relfilenumber
257 : * from being reused. The scenario this protects us from is:
258 : * 1. We delete a relation (and commit, and actually remove its file).
259 : * 2. We create a new relation, which by chance gets the same relfilenumber as
260 : * the just-deleted one (OIDs must've wrapped around for that to happen).
261 : * 3. We crash before another checkpoint occurs.
262 : * During replay, we would delete the file and then recreate it, which is fine
263 : * if the contents of the file were repopulated by subsequent WAL entries.
264 : * But if we didn't WAL-log insertions, but instead relied on fsyncing the
265 : * file after populating it (as we do at wal_level=minimal), the contents of
266 : * the file would be lost forever. By leaving the empty file until after the
267 : * next checkpoint, we prevent reassignment of the relfilenumber until it's
268 : * safe, because relfilenumber assignment skips over any existing file.
269 : *
270 : * Additional segments, if any, are truncated and then unlinked. The reason
271 : * for truncating is that other backends may still hold open FDs for these at
272 : * the smgr level, so that the kernel can't remove the file yet. We want to
273 : * reclaim the disk space right away despite that.
274 : *
275 : * We do not need to go through this dance for temp relations, though, because
276 : * we never make WAL entries for temp rels, and so a temp rel poses no threat
277 : * to the health of a regular rel that has taken over its relfilenumber.
278 : * The fact that temp rels and regular rels have different file naming
279 : * patterns provides additional safety. Other backends shouldn't have open
280 : * FDs for them, either.
281 : *
282 : * We also don't do it while performing a binary upgrade. There is no reuse
283 : * hazard in that case, since after a crash or even a simple ERROR, the
284 : * upgrade fails and the whole cluster must be recreated from scratch.
285 : * Furthermore, it is important to remove the files from disk immediately,
286 : * because we may be about to reuse the same relfilenumber.
287 : *
288 : * All the above applies only to the relation's main fork; other forks can
289 : * just be removed immediately, since they are not needed to prevent the
290 : * relfilenumber from being recycled. Also, we do not carefully
291 : * track whether other forks have been created or not, but just attempt to
292 : * unlink them unconditionally; so we should never complain about ENOENT.
293 : *
294 : * If isRedo is true, it's unsurprising for the relation to be already gone.
295 : * Also, we should remove the file immediately instead of queuing a request
296 : * for later, since during redo there's no possibility of creating a
297 : * conflicting relation.
298 : *
299 : * Note: we currently just never warn about ENOENT at all. We could warn in
300 : * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
301 : *
302 : * Note: any failure should be reported as WARNING not ERROR, because
303 : * we are usually not in a transaction anymore when this is called.
304 : */
305 : void
202 pg 306 GNC 149528 : mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
307 : {
308 : /* Now do the per-fork work */
309 149528 : if (forknum == InvalidForkNumber)
310 : {
202 pg 311 UNC 0 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
312 0 : mdunlinkfork(rlocator, forknum, isRedo);
313 : }
314 : else
202 pg 315 GNC 149528 : mdunlinkfork(rlocator, forknum, isRedo);
3916 tgl 316 CBC 149528 : }
317 :
318 : /*
859 tmunro 319 ECB : * Truncate a file to release disk space.
320 : */
859 tmunro 321 EUB : static int
859 tmunro 322 GBC 174303 : do_truncate(const char *path)
323 : {
324 : int save_errno;
859 tmunro 325 ECB : int ret;
326 :
859 tmunro 327 GIC 174303 : ret = pg_truncate(path, 0);
328 :
329 : /* Log a warning here to avoid repetition in callers. */
330 174303 : if (ret < 0 && errno != ENOENT)
331 : {
859 tmunro 332 LBC 0 : save_errno = errno;
859 tmunro 333 UIC 0 : ereport(WARNING,
334 : (errcode_for_file_access(),
335 : errmsg("could not truncate file \"%s\": %m", path)));
336 0 : errno = save_errno;
859 tmunro 337 ECB : }
338 :
859 tmunro 339 GIC 174303 : return ret;
859 tmunro 340 ECB : }
341 :
3916 tgl 342 EUB : static void
202 pg 343 GNC 149528 : mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
344 : {
345 : char *path;
3916 tgl 346 EUB : int ret;
347 : int save_errno;
348 :
202 pg 349 GNC 149528 : path = relpath(rlocator, forknum);
350 :
351 : /*
352 : * Truncate and then unlink the first segment, or just register a request
151 tgl 353 ECB : * to unlink it later, as described in the comments for mdunlink().
354 : */
151 tgl 355 GNC 149528 : if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
356 31279 : RelFileLocatorBackendIsTemp(rlocator))
357 : {
277 rhaas 358 121057 : if (!RelFileLocatorBackendIsTemp(rlocator))
859 tmunro 359 ECB : {
360 : /* Prevent other backends' fds from holding on to the disk space */
859 tmunro 361 GIC 109825 : ret = do_truncate(path);
362 :
363 : /* Forget any pending sync requests for the first segment */
153 tgl 364 109825 : save_errno = errno;
202 pg 365 GNC 109825 : register_forget_request(rlocator, forknum, 0 /* first seg */ );
153 tgl 366 CBC 109825 : errno = save_errno;
367 : }
859 tmunro 368 ECB : else
859 tmunro 369 GIC 11232 : ret = 0;
370 :
859 tmunro 371 ECB : /* Next unlink the file, unless it was already found to be missing */
151 tgl 372 GIC 121057 : if (ret >= 0 || errno != ENOENT)
373 : {
859 tmunro 374 CBC 18772 : ret = unlink(path);
375 18772 : if (ret < 0 && errno != ENOENT)
151 tgl 376 ECB : {
151 tgl 377 UIC 0 : save_errno = errno;
859 tmunro 378 0 : ereport(WARNING,
859 tmunro 379 ECB : (errcode_for_file_access(),
380 : errmsg("could not remove file \"%s\": %m", path)));
151 tgl 381 UIC 0 : errno = save_errno;
151 tgl 382 ECB : }
383 : }
4995 heikki.linnakangas 384 : }
5624 tgl 385 : else
386 : {
859 tmunro 387 EUB : /* Prevent other backends' fds from holding on to the disk space */
859 tmunro 388 GBC 28471 : ret = do_truncate(path);
389 :
390 : /* Register request to unlink first segment later */
151 tgl 391 28471 : save_errno = errno;
151 tgl 392 GNC 28471 : register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
151 tgl 393 GIC 28471 : errno = save_errno;
394 : }
395 :
396 : /*
397 : * Delete any additional segments.
151 tgl 398 ECB : *
399 : * Note that because we loop until getting ENOENT, we will correctly
400 : * remove all inactive segments as well as active ones. Ideally we'd
401 : * continue the loop until getting exactly that errno, but that risks an
402 : * infinite loop if the problem is directory-wide (for instance, if we
403 : * suddenly can't read the data directory itself). We compromise by
404 : * continuing after a non-ENOENT truncate error, but stopping after any
405 : * unlink error. If there is indeed a directory-wide problem, additional
406 : * unlink attempts wouldn't work anyway.
407 : */
151 tgl 408 GIC 149528 : if (ret >= 0 || errno != ENOENT)
409 : {
8187 410 38946 : char *segpath = (char *) palloc(strlen(path) + 12);
411 : BlockNumber segno;
412 :
151 413 38946 : for (segno = 1;; segno++)
414 : {
415 38946 : sprintf(segpath, "%s.%u", path, segno);
416 :
277 rhaas 417 GNC 38946 : if (!RelFileLocatorBackendIsTemp(rlocator))
859 tmunro 418 ECB : {
419 : /*
420 : * Prevent other backends' fds from holding on to the disk
421 : * space. We're done if we see ENOENT, though.
422 : */
859 tmunro 423 CBC 36007 : if (do_truncate(segpath) < 0 && errno == ENOENT)
859 tmunro 424 GIC 36007 : break;
859 tmunro 425 ECB :
426 : /*
427 : * Forget any pending sync requests for this segment before we
428 : * try to unlink.
429 : */
202 pg 430 UNC 0 : register_forget_request(rlocator, forknum, segno);
431 : }
432 :
8187 tgl 433 CBC 2939 : if (unlink(segpath) < 0)
8187 tgl 434 ECB : {
435 : /* ENOENT is expected after the last segment... */
8187 tgl 436 GIC 2939 : if (errno != ENOENT)
5940 tgl 437 UIC 0 : ereport(WARNING,
438 : (errcode_for_file_access(),
439 : errmsg("could not remove file \"%s\": %m", segpath)));
8187 tgl 440 GBC 2939 : break;
441 : }
442 : }
8187 tgl 443 CBC 38946 : pfree(segpath);
444 : }
445 :
446 149528 : pfree(path);
9770 scrappy 447 GBC 149528 : }
448 :
449 : /*
9345 bruce 450 ECB : * mdextend() -- Add a block to the specified relation.
451 : *
452 : * The semantics are nearly the same as mdwrite(): write at the
5940 tgl 453 : * specified position. However, this is to be used for the case of
454 : * extending a relation (i.e., blocknum is at or beyond the current
455 : * EOF). Note that we assume writing a block beyond current EOF
456 : * causes intervening file space to become filled with zeroes.
9770 scrappy 457 : */
458 : void
5354 heikki.linnakangas 459 GIC 177935 : mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
460 : const void *buffer, bool skipFsync)
461 : {
462 : off_t seekpos;
463 : int nbytes;
464 : MdfdVec *v;
465 :
466 : /* If this build supports direct I/O, the buffer must be I/O aligned. */
467 : if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
1 tmunro 468 GNC 177935 : Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
469 :
470 : /* This assert is too expensive to have on normally ... */
471 : #ifdef CHECK_WRITE_VS_EXTEND
472 : Assert(blocknum >= mdnblocks(reln, forknum));
5940 tgl 473 ECB : #endif
474 :
475 : /*
476 : * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
477 : * more --- we mustn't create a block whose number actually is
478 : * InvalidBlockNumber. (Note that this failure should be unreachable
479 : * because of upstream checks in bufmgr.c.)
480 : */
5940 tgl 481 GIC 177935 : if (blocknum == InvalidBlockNumber)
5940 tgl 482 LBC 0 : ereport(ERROR,
483 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
484 : errmsg("cannot extend file \"%s\" beyond %u blocks",
485 : relpath(reln->smgr_rlocator, forknum),
486 : InvalidBlockNumber)));
487 :
4622 rhaas 488 GIC 177935 : v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
489 :
2118 tgl 490 177935 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
491 :
5508 492 177935 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
493 :
1614 tmunro 494 177935 : if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
8586 inoue 495 ECB : {
5940 tgl 496 UBC 0 : if (nbytes < 0)
5940 tgl 497 UIC 0 : ereport(ERROR,
498 : (errcode_for_file_access(),
499 : errmsg("could not extend file \"%s\": %m",
500 : FilePathName(v->mdfd_vfd)),
501 : errhint("Check free disk space.")));
5940 tgl 502 ECB : /* short write: complain appropriately */
5940 tgl 503 UIC 0 : ereport(ERROR,
5940 tgl 504 ECB : (errcode(ERRCODE_DISK_FULL),
505 : errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
4995 heikki.linnakangas 506 : FilePathName(v->mdfd_vfd),
507 : nbytes, BLCKSZ, blocknum),
5940 tgl 508 : errhint("Check free disk space.")));
509 : }
9770 scrappy 510 EUB :
4622 rhaas 511 GBC 177935 : if (!skipFsync && !SmgrIsTemp(reln))
5354 heikki.linnakangas 512 GIC 28 : register_dirty_segment(reln, forknum, v);
513 :
514 177935 : Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
9770 scrappy 515 177935 : }
516 :
517 : /*
518 : * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
519 : *
520 : * Similar to mdextend(), except the relation can be extended by multiple
521 : * blocks at once and the added blocks will be filled with zeroes.
522 : */
523 : void
4 andres 524 GNC 343785 : mdzeroextend(SMgrRelation reln, ForkNumber forknum,
525 : BlockNumber blocknum, int nblocks, bool skipFsync)
526 : {
527 : MdfdVec *v;
528 343785 : BlockNumber curblocknum = blocknum;
529 343785 : int remblocks = nblocks;
530 :
531 343785 : Assert(nblocks > 0);
532 :
533 : /* This assert is too expensive to have on normally ... */
534 : #ifdef CHECK_WRITE_VS_EXTEND
535 : Assert(blocknum >= mdnblocks(reln, forknum));
536 : #endif
537 :
538 : /*
539 : * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
540 : * more --- we mustn't create a block whose number actually is
541 : * InvalidBlockNumber or larger.
542 : */
543 343785 : if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
4 andres 544 UNC 0 : ereport(ERROR,
545 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
546 : errmsg("cannot extend file \"%s\" beyond %u blocks",
547 : relpath(reln->smgr_rlocator, forknum),
548 : InvalidBlockNumber)));
549 :
4 andres 550 GNC 687570 : while (remblocks > 0)
551 : {
552 343785 : BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
553 343785 : off_t seekpos = (off_t) BLCKSZ * segstartblock;
554 : int numblocks;
555 :
556 343785 : if (segstartblock + remblocks > RELSEG_SIZE)
4 andres 557 UNC 0 : numblocks = RELSEG_SIZE - segstartblock;
558 : else
4 andres 559 GNC 343785 : numblocks = remblocks;
560 :
561 343785 : v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
562 :
563 343785 : Assert(segstartblock < RELSEG_SIZE);
564 343785 : Assert(segstartblock + numblocks <= RELSEG_SIZE);
565 :
566 : /*
567 : * If available and useful, use posix_fallocate() (via FileAllocate())
568 : * to extend the relation. That's often more efficient than using
569 : * write(), as it commonly won't cause the kernel to allocate page
570 : * cache space for the extended pages.
571 : *
572 : * However, we don't use FileAllocate() for small extensions, as it
573 : * defeats delayed allocation on some filesystems. Not clear where
574 : * that decision should be made though? For now just use a cutoff of
575 : * 8, anything between 4 and 8 worked OK in some local testing.
576 : */
577 343785 : if (numblocks > 8)
578 : {
579 : int ret;
580 :
581 385 : ret = FileFallocate(v->mdfd_vfd,
582 : seekpos, (off_t) BLCKSZ * numblocks,
583 : WAIT_EVENT_DATA_FILE_EXTEND);
584 385 : if (ret != 0)
585 : {
4 andres 586 UNC 0 : ereport(ERROR,
587 : errcode_for_file_access(),
588 : errmsg("could not extend file \"%s\" with FileFallocate(): %m",
589 : FilePathName(v->mdfd_vfd)),
590 : errhint("Check free disk space."));
591 : }
592 : }
593 : else
594 : {
595 : int ret;
596 :
597 : /*
598 : * Even if we don't want to use fallocate, we can still extend a
599 : * bit more efficiently than writing each 8kB block individually.
600 : * pg_pwrite_zeroes() (via FileZero()) uses
601 : * pg_pwritev_with_retry() to avoid multiple writes or needing a
602 : * zeroed buffer for the whole length of the extension.
603 : */
4 andres 604 GNC 343400 : ret = FileZero(v->mdfd_vfd,
605 : seekpos, (off_t) BLCKSZ * numblocks,
606 : WAIT_EVENT_DATA_FILE_EXTEND);
607 343400 : if (ret < 0)
4 andres 608 UNC 0 : ereport(ERROR,
609 : errcode_for_file_access(),
610 : errmsg("could not extend file \"%s\": %m",
611 : FilePathName(v->mdfd_vfd)),
612 : errhint("Check free disk space."));
613 : }
614 :
4 andres 615 GNC 343785 : if (!skipFsync && !SmgrIsTemp(reln))
616 332890 : register_dirty_segment(reln, forknum, v);
617 :
618 343785 : Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
619 :
620 343785 : remblocks -= numblocks;
621 343785 : curblocknum += numblocks;
622 : }
623 343785 : }
624 :
9770 scrappy 625 EUB : /*
626 : * mdopenfork() -- Open one fork of the specified relation.
627 : *
628 : * Note we only open the first segment, when there are multiple segments.
629 : *
630 : * If first segment is not present, either ereport or return NULL according
631 : * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
632 : * EXTENSION_CREATE means it's OK to extend an existing relation, not to
5940 tgl 633 ECB : * invent one out of whole cloth.
9770 scrappy 634 : */
635 : static MdfdVec *
1362 tmunro 636 CBC 3993243 : mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
9770 scrappy 637 ECB : {
638 : MdfdVec *mdfd;
639 : char *path;
640 : File fd;
641 :
642 : /* No work if already open */
2404 andres 643 GIC 3993243 : if (reln->md_num_open_segs[forknum] > 0)
644 2536346 : return &reln->md_seg_fds[forknum][0];
645 :
277 rhaas 646 GNC 1456897 : path = relpath(reln->smgr_rlocator, forknum);
647 :
1 tmunro 648 1456897 : fd = PathNameOpenFile(path, _mdfd_open_flags());
649 :
9345 bruce 650 CBC 1456897 : if (fd < 0)
8617 tgl 651 ECB : {
1532 akapila 652 GIC 409266 : if ((behavior & EXTENSION_RETURN_NULL) &&
1532 akapila 653 CBC 409245 : FILE_POSSIBLY_DELETED(errno))
654 : {
1532 akapila 655 GIC 409245 : pfree(path);
656 409245 : return NULL;
657 : }
658 21 : ereport(ERROR,
659 : (errcode_for_file_access(),
660 : errmsg("could not open file \"%s\": %m", path)));
661 : }
662 :
8187 tgl 663 1047631 : pfree(path);
664 :
2404 andres 665 CBC 1047631 : _fdvec_resize(reln, forknum, 1);
2404 andres 666 GBC 1047631 : mdfd = &reln->md_seg_fds[forknum][0];
6887 tgl 667 GIC 1047631 : mdfd->mdfd_vfd = fd;
668 1047631 : mdfd->mdfd_segno = 0;
669 :
5354 heikki.linnakangas 670 1047631 : Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
671 :
6887 tgl 672 CBC 1047631 : return mdfd;
673 : }
9770 scrappy 674 ECB :
1362 tmunro 675 : /*
676 : * mdopen() -- Initialize newly-opened relation.
677 : */
678 : void
1362 tmunro 679 GBC 1342539 : mdopen(SMgrRelation reln)
680 : {
1362 tmunro 681 ECB : /* mark it not open */
1362 tmunro 682 GIC 6712695 : for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
1362 tmunro 683 CBC 5370156 : reln->md_num_open_segs[forknum] = 0;
1362 tmunro 684 GIC 1342539 : }
1362 tmunro 685 ECB :
9770 scrappy 686 : /*
687 : * mdclose() -- Close the specified relation, if it isn't closed already.
688 : */
689 : void
5354 heikki.linnakangas 690 GIC 4185971 : mdclose(SMgrRelation reln, ForkNumber forknum)
691 : {
2404 andres 692 4185971 : int nopensegs = reln->md_num_open_segs[forknum];
693 :
694 : /* No work if already closed */
695 4185971 : if (nopensegs == 0)
5940 tgl 696 3212515 : return;
697 :
698 : /* close segments starting from the end */
2404 andres 699 CBC 1946912 : while (nopensegs > 0)
700 : {
2404 andres 701 GIC 973456 : MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
702 :
1185 noah 703 CBC 973456 : FileClose(v->mdfd_vfd);
1185 noah 704 GIC 973456 : _fdvec_resize(reln, forknum, nopensegs - 1);
2404 andres 705 973456 : nopensegs--;
9345 bruce 706 ECB : }
707 : }
9770 scrappy 708 EUB :
709 : /*
710 : * mdprefetch() -- Initiate asynchronous read of the specified block of a relation
711 : */
712 : bool
5200 tgl 713 GIC 212103 : mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
714 : {
715 : #ifdef USE_PREFETCH
716 : off_t seekpos;
717 : MdfdVec *v;
718 :
1 tmunro 719 GNC 212103 : Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
720 :
1096 tmunro 721 GIC 212103 : v = _mdfd_getseg(reln, forknum, blocknum, false,
722 212103 : InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
723 212103 : if (v == NULL)
1096 tmunro 724 UIC 0 : return false;
725 :
2118 tgl 726 GIC 212103 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
727 :
5200 tgl 728 CBC 212103 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
729 :
2213 rhaas 730 GIC 212103 : (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
2118 tgl 731 ECB : #endif /* USE_PREFETCH */
1096 tmunro 732 EUB :
1096 tmunro 733 GIC 212103 : return true;
734 : }
735 :
736 : /*
737 : * mdwriteback() -- Tell the kernel to write pages back to storage.
738 : *
2606 andres 739 ECB : * This accepts a range of blocks because flushing several pages at once is
740 : * considerably more efficient than doing so individually.
741 : */
742 : void
2552 tgl 743 GIC 134299 : mdwriteback(SMgrRelation reln, ForkNumber forknum,
2552 tgl 744 ECB : BlockNumber blocknum, BlockNumber nblocks)
2606 andres 745 : {
1 tmunro 746 GNC 134299 : Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
747 :
748 : /*
2606 andres 749 ECB : * Issue flush requests in as few requests as possible; have to split at
750 : * segment boundaries though, since those are actually separate files.
751 : */
2552 tgl 752 GIC 268321 : while (nblocks > 0)
753 : {
754 134299 : BlockNumber nflush = nblocks;
755 : off_t seekpos;
756 : MdfdVec *v;
757 : int segnum_start,
758 : segnum_end;
759 :
2531 andres 760 134299 : v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
761 : EXTENSION_DONT_OPEN);
2606 andres 762 ECB :
763 : /*
764 : * We might be flushing buffers of already removed relations, that's
765 : * ok, just ignore that case. If the segment file wasn't open already
766 : * (ie from a recent mdwrite()), then we don't want to re-open it, to
767 : * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
768 : * us with a descriptor to a file that is about to be unlinked.
769 : */
2606 andres 770 CBC 134299 : if (!v)
2606 andres 771 GIC 277 : return;
2606 andres 772 ECB :
773 : /* compute offset inside the current segment */
2606 andres 774 CBC 134022 : segnum_start = blocknum / RELSEG_SIZE;
775 :
2606 andres 776 ECB : /* compute number of desired writes within the current segment */
2606 andres 777 GIC 134022 : segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
2606 andres 778 CBC 134022 : if (segnum_start != segnum_end)
2606 andres 779 LBC 0 : nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
780 :
2606 andres 781 CBC 134022 : Assert(nflush >= 1);
782 134022 : Assert(nflush <= nblocks);
783 :
2118 tgl 784 134022 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
785 :
2213 rhaas 786 GIC 134022 : FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
787 :
2606 andres 788 134022 : nblocks -= nflush;
2606 andres 789 CBC 134022 : blocknum += nflush;
790 : }
2606 andres 791 ECB : }
5200 tgl 792 :
9770 scrappy 793 : /*
9345 bruce 794 : * mdread() -- Read the specified block from a relation.
795 : */
5940 tgl 796 : void
5354 heikki.linnakangas 797 GIC 1310557 : mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
798 : void *buffer)
799 : {
800 : off_t seekpos;
801 : int nbytes;
802 : MdfdVec *v;
803 :
804 : /* If this build supports direct I/O, the buffer must be I/O aligned. */
805 : if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
1 tmunro 806 GNC 1310557 : Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
807 :
808 : TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
809 : reln->smgr_rlocator.locator.spcOid,
810 : reln->smgr_rlocator.locator.dbOid,
811 : reln->smgr_rlocator.locator.relNumber,
812 : reln->smgr_rlocator.backend);
5226 bruce 813 ECB :
2531 andres 814 CBC 1310557 : v = _mdfd_getseg(reln, forknum, blocknum, false,
815 : EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
816 :
2118 tgl 817 GIC 1310544 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
818 :
5508 819 1310544 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
9770 scrappy 820 ECB :
1614 tmunro 821 GIC 1310544 : nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
5226 bruce 822 ECB :
823 : TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
824 : reln->smgr_rlocator.locator.spcOid,
825 : reln->smgr_rlocator.locator.dbOid,
826 : reln->smgr_rlocator.locator.relNumber,
827 : reln->smgr_rlocator.backend,
828 : nbytes,
5142 tgl 829 : BLCKSZ);
830 :
5226 bruce 831 CBC 1310544 : if (nbytes != BLCKSZ)
832 : {
5940 tgl 833 LBC 0 : if (nbytes < 0)
834 0 : ereport(ERROR,
5940 tgl 835 ECB : (errcode_for_file_access(),
836 : errmsg("could not read block %u in file \"%s\": %m",
837 : blocknum, FilePathName(v->mdfd_vfd))));
838 :
839 : /*
840 : * Short read: we are at or past EOF, or we read a partial block at
841 : * EOF. Normally this is an error; upper levels should never try to
842 : * read a nonexistent block. However, if zero_damaged_pages is ON or
5624 bruce 843 : * we are InRecovery, we should instead return zeroes without
844 : * complaining. This allows, for example, the case of trying to
845 : * update a block that was later truncated away.
846 : */
5940 tgl 847 UIC 0 : if (zero_damaged_pages || InRecovery)
8586 inoue 848 0 : MemSet(buffer, 0, BLCKSZ);
9345 bruce 849 ECB : else
5940 tgl 850 UIC 0 : ereport(ERROR,
5940 tgl 851 ECB : (errcode(ERRCODE_DATA_CORRUPTED),
4995 heikki.linnakangas 852 : errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
853 : blocknum, FilePathName(v->mdfd_vfd),
5940 tgl 854 EUB : nbytes, BLCKSZ)));
855 : }
9770 scrappy 856 CBC 1310544 : }
857 :
9770 scrappy 858 ECB : /*
859 : * mdwrite() -- Write the supplied block at the appropriate location.
5940 tgl 860 : *
861 : * This is to be used only for updating already-existing blocks of a
862 : * relation (ie, those before the current EOF). To extend a relation,
863 : * use mdextend().
864 : */
865 : void
5354 heikki.linnakangas 866 GIC 748947 : mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
867 : const void *buffer, bool skipFsync)
868 : {
869 : off_t seekpos;
870 : int nbytes;
871 : MdfdVec *v;
872 :
873 : /* If this build supports direct I/O, the buffer must be I/O aligned. */
874 : if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
1 tmunro 875 GNC 748947 : Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
876 :
5940 tgl 877 ECB : /* This assert is too expensive to have on normally ... */
878 : #ifdef CHECK_WRITE_VS_EXTEND
879 : Assert(blocknum < mdnblocks(reln, forknum));
880 : #endif
881 :
882 : TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
883 : reln->smgr_rlocator.locator.spcOid,
884 : reln->smgr_rlocator.locator.dbOid,
885 : reln->smgr_rlocator.locator.relNumber,
886 : reln->smgr_rlocator.backend);
887 :
2531 andres 888 CBC 748947 : v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
889 : EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
890 :
2118 tgl 891 GIC 748947 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
892 :
5508 893 748947 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
9770 scrappy 894 ECB :
1614 tmunro 895 GIC 748947 : nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
896 :
897 : TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
898 : reln->smgr_rlocator.locator.spcOid,
899 : reln->smgr_rlocator.locator.dbOid,
900 : reln->smgr_rlocator.locator.relNumber,
901 : reln->smgr_rlocator.backend,
902 : nbytes,
903 : BLCKSZ);
5226 bruce 904 ECB :
5226 bruce 905 CBC 748947 : if (nbytes != BLCKSZ)
906 : {
5940 tgl 907 UIC 0 : if (nbytes < 0)
5940 tgl 908 LBC 0 : ereport(ERROR,
909 : (errcode_for_file_access(),
910 : errmsg("could not write block %u in file \"%s\": %m",
4995 heikki.linnakangas 911 ECB : blocknum, FilePathName(v->mdfd_vfd))));
5940 tgl 912 : /* short write: complain appropriately */
5940 tgl 913 UBC 0 : ereport(ERROR,
914 : (errcode(ERRCODE_DISK_FULL),
4995 heikki.linnakangas 915 ECB : errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
5940 tgl 916 : blocknum,
917 : FilePathName(v->mdfd_vfd),
918 : nbytes, BLCKSZ),
919 : errhint("Check free disk space.")));
6887 920 : }
921 :
4622 rhaas 922 CBC 748947 : if (!skipFsync && !SmgrIsTemp(reln))
5354 heikki.linnakangas 923 725820 : register_dirty_segment(reln, forknum, v);
8400 tgl 924 GIC 748947 : }
925 :
926 : /*
927 : * mdnblocks() -- Get the number of blocks stored in a relation.
928 : *
929 : * Important side effect: all active segments of the relation are opened
930 : * and added to the md_seg_fds array. If this routine has not been
8620 tgl 931 ECB : * called, then only segments up to the last one actually touched
932 : * are present in the array.
933 : */
934 : BlockNumber
5354 heikki.linnakangas 935 GIC 2682691 : mdnblocks(SMgrRelation reln, ForkNumber forknum)
936 : {
937 : MdfdVec *v;
938 : BlockNumber nblocks;
939 : BlockNumber segno;
948 bruce 940 ECB :
948 bruce 941 GIC 2682691 : mdopenfork(reln, forknum, EXTENSION_FAIL);
942 :
943 : /* mdopen has opened the first segment */
2404 andres 944 2682673 : Assert(reln->md_num_open_segs[forknum] > 0);
945 :
946 : /*
947 : * Start from the last open segments, to avoid redundant seeks. We have
2404 andres 948 ECB : * previously verified that these segments are exactly RELSEG_SIZE long,
949 : * and it's useless to recheck that each time.
950 : *
5984 tgl 951 : * NOTE: this assumption could only be wrong if another backend has
952 : * truncated the relation. We rely on higher code levels to handle that
953 : * scenario by closing and re-opening the md fd, which is handled via
954 : * relcache flush. (Since the checkpointer doesn't participate in
2404 andres 955 : * relcache flush, it could have segment entries for inactive segments;
956 : * that's OK because the checkpointer never needs to compute relation
957 : * size.)
958 : */
2404 andres 959 GIC 2682673 : segno = reln->md_num_open_segs[forknum] - 1;
960 2682673 : v = &reln->md_seg_fds[forknum][segno];
961 :
962 : for (;;)
963 : {
5354 heikki.linnakangas 964 2682673 : nblocks = _mdnblocks(reln, forknum, v);
7956 tgl 965 CBC 2682673 : if (nblocks > ((BlockNumber) RELSEG_SIZE))
7199 tgl 966 UIC 0 : elog(FATAL, "segment too big");
7956 tgl 967 GBC 2682673 : if (nblocks < ((BlockNumber) RELSEG_SIZE))
968 2682673 : return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
969 :
970 : /*
971 : * If segment is exactly RELSEG_SIZE, advance to next one.
972 : */
8004 tgl 973 UIC 0 : segno++;
974 :
975 : /*
976 : * We used to pass O_CREAT here, but that has the disadvantage that it
977 : * might create a segment which has vanished through some operating
978 : * system misadventure. In such a case, creating the segment here
979 : * undermines _mdfd_getseg's attempts to notice and report an error
980 : * upon access to a missing segment.
2404 andres 981 EUB : */
2404 andres 982 UBC 0 : v = _mdfd_openseg(reln, forknum, segno, 0);
2404 andres 983 UIC 0 : if (v == NULL)
2404 andres 984 UBC 0 : return segno * ((BlockNumber) RELSEG_SIZE);
985 : }
986 : }
987 :
988 : /*
989 : * mdtruncate() -- Truncate relation to specified number of blocks.
9629 vadim4o 990 ECB : */
991 : void
4622 rhaas 992 GIC 663 : mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
993 : {
994 : BlockNumber curnblk;
995 : BlockNumber priorblocks;
996 : int curopensegs;
997 :
998 : /*
999 : * NOTE: mdnblocks makes sure we have opened all active segments, so that
5624 bruce 1000 ECB : * truncation loop will get them all!
1001 : */
5354 heikki.linnakangas 1002 GIC 663 : curnblk = mdnblocks(reln, forknum);
7956 tgl 1003 663 : if (nblocks > curnblk)
1004 : {
1005 : /* Bogus request ... but no complaint if InRecovery */
5940 tgl 1006 UIC 0 : if (InRecovery)
1007 0 : return;
1008 0 : ereport(ERROR,
4995 heikki.linnakangas 1009 ECB : (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1010 : relpath(reln->smgr_rlocator, forknum),
1011 : nblocks, curnblk)));
1012 : }
8620 tgl 1013 GIC 663 : if (nblocks == curnblk)
5940 1014 276 : return; /* no work */
1015 :
1016 : /*
1017 : * Truncate segments, starting at the last one. Starting at the end makes
1018 : * managing the memory for the fd array easier, should there be errors.
1019 : */
2404 andres 1020 387 : curopensegs = reln->md_num_open_segs[forknum];
1021 774 : while (curopensegs > 0)
8696 bruce 1022 ECB : {
1023 : MdfdVec *v;
1024 :
2404 andres 1025 CBC 387 : priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1026 :
1027 387 : v = &reln->md_seg_fds[forknum][curopensegs - 1];
1028 :
8620 tgl 1029 387 : if (priorblocks > nblocks)
1030 : {
1031 : /*
1032 : * This segment is no longer active. We truncate the file, but do
1033 : * not delete it, for reasons explained in the header comments.
1034 : */
2213 rhaas 1035 UIC 0 : if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
5940 tgl 1036 0 : ereport(ERROR,
1037 : (errcode_for_file_access(),
1038 : errmsg("could not truncate file \"%s\": %m",
4995 heikki.linnakangas 1039 ECB : FilePathName(v->mdfd_vfd))));
1040 :
4622 rhaas 1041 UBC 0 : if (!SmgrIsTemp(reln))
5354 heikki.linnakangas 1042 0 : register_dirty_segment(reln, forknum, v);
1043 :
1044 : /* we never drop the 1st segment */
2404 andres 1045 UIC 0 : Assert(v != &reln->md_seg_fds[forknum][0]);
1046 :
2404 andres 1047 UBC 0 : FileClose(v->mdfd_vfd);
2404 andres 1048 UIC 0 : _fdvec_resize(reln, forknum, curopensegs - 1);
1049 : }
7956 tgl 1050 GIC 387 : else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1051 : {
1052 : /*
1053 : * This is the last segment we want to keep. Truncate the file to
1054 : * the right length. NOTE: if nblocks is exactly a multiple K of
1055 : * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
2404 andres 1056 ECB : * keep it. This adheres to the invariant given in the header
1057 : * comments.
8620 tgl 1058 : */
7836 bruce 1059 GIC 387 : BlockNumber lastsegblocks = nblocks - priorblocks;
1060 :
2213 rhaas 1061 387 : if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
5940 tgl 1062 UIC 0 : ereport(ERROR,
1063 : (errcode_for_file_access(),
1064 : errmsg("could not truncate file \"%s\" to %u blocks: %m",
1065 : FilePathName(v->mdfd_vfd),
1066 : nblocks)));
4622 rhaas 1067 GIC 387 : if (!SmgrIsTemp(reln))
5354 heikki.linnakangas 1068 250 : register_dirty_segment(reln, forknum, v);
8620 tgl 1069 ECB : }
1070 : else
1071 : {
1072 : /*
1073 : * We still need this segment, so nothing to do for this and any
1074 : * earlier segment.
1075 : */
2404 andres 1076 UIC 0 : break;
1077 : }
2404 andres 1078 CBC 387 : curopensegs--;
1079 : }
1080 : }
1081 :
1082 : /*
1083 : * mdimmedsync() -- Immediately sync a relation to stable storage.
1084 : *
1085 : * Note that only writes already issued are synced; this routine knows
1086 : * nothing of dirty buffers that may exist inside the buffer manager. We
1087 : * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1088 : * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1089 : * some segment, then mdtruncate() renders that segment inactive. If we
1090 : * crash before the next checkpoint syncs the newly-inactive segment, that
1091 : * segment may survive recovery, reintroducing unwanted data into the table.
1092 : */
5940 tgl 1093 ECB : void
5354 heikki.linnakangas 1094 CBC 59652 : mdimmedsync(SMgrRelation reln, ForkNumber forknum)
1095 : {
1096 : int segno;
1097 : int min_inactive_seg;
6885 tgl 1098 ECB :
1099 : /*
5624 bruce 1100 EUB : * NOTE: mdnblocks makes sure we have opened all active segments, so that
5624 bruce 1101 ECB : * fsync loop will get them all!
6885 tgl 1102 : */
4381 peter_e 1103 GIC 59652 : mdnblocks(reln, forknum);
1104 :
1100 noah 1105 59652 : min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1106 :
1100 noah 1107 EUB : /*
1108 : * Temporarily open inactive segments, then close them after sync. There
1109 : * may be some inactive segments left opened after fsync() error, but that
1110 : * is harmless. We don't bother to clean them up and take a risk of
1111 : * further trouble. The next mdclose() will soon close them.
1112 : */
1100 noah 1113 GIC 59652 : while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1100 noah 1114 UIC 0 : segno++;
1115 :
2404 andres 1116 GBC 119304 : while (segno > 0)
6885 tgl 1117 EUB : {
2404 andres 1118 GBC 59652 : MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1119 :
1120 : /*
1121 : * fsyncs done through mdimmedsync() should be tracked in a separate
1122 : * IOContext than those done through mdsyncfiletag() to differentiate
1123 : * between unavoidable client backend fsyncs (e.g. those done during
1124 : * index build) and those which ideally would have been done by the
1125 : * checkpointer. Since other IO operations bypassing the buffer
1126 : * manager could also be tracked in such an IOContext, wait until
1127 : * these are also tracked to track immediate fsyncs.
1128 : */
2213 rhaas 1129 GIC 59652 : if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1602 tmunro 1130 UIC 0 : ereport(data_sync_elevel(ERROR),
1131 : (errcode_for_file_access(),
1132 : errmsg("could not fsync file \"%s\": %m",
1133 : FilePathName(v->mdfd_vfd))));
1134 :
1100 noah 1135 ECB : /* Close inactive segments immediately */
1100 noah 1136 GIC 59652 : if (segno > min_inactive_seg)
1137 : {
1100 noah 1138 UIC 0 : FileClose(v->mdfd_vfd);
1139 0 : _fdvec_resize(reln, forknum, segno - 1);
1140 : }
1141 :
2404 andres 1142 GIC 59652 : segno--;
1143 : }
6885 tgl 1144 59652 : }
6885 tgl 1145 ECB :
9770 scrappy 1146 : /*
1147 : * register_dirty_segment() -- Mark a relation segment as needing fsync
1148 : *
6887 tgl 1149 EUB : * If there is a local pending-ops table, just make an entry in it for
1466 tmunro 1150 : * ProcessSyncRequests to process later. Otherwise, try to pass off the
1151 : * fsync request to the checkpointer process. If that fails, just do the
1152 : * fsync locally before returning (we hope this will not happen often
1153 : * enough to be a performance problem).
1154 : */
1155 : static void
5354 heikki.linnakangas 1156 CBC 1058988 : register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
9770 scrappy 1157 ECB : {
1158 : FileTag tag;
1159 :
277 rhaas 1160 GNC 1058988 : INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1161 :
1162 : /* Temp relations should never be fsync'd */
3918 tgl 1163 CBC 1058988 : Assert(!SmgrIsTemp(reln));
3918 tgl 1164 ECB :
1466 tmunro 1165 GIC 1058988 : if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1166 : {
1167 : instr_time io_start;
1168 :
2 andres 1169 135 : ereport(DEBUG1,
2 andres 1170 ECB : (errmsg_internal("could not forward fsync request because request queue is full")));
1171 :
2 andres 1172 GNC 135 : io_start = pgstat_prepare_io_time();
1173 :
2 andres 1174 CBC 135 : if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
2 andres 1175 UIC 0 : ereport(data_sync_elevel(ERROR),
2 andres 1176 ECB : (errcode_for_file_access(),
1177 : errmsg("could not fsync file \"%s\": %m",
1178 : FilePathName(seg->mdfd_vfd))));
1179 :
1180 : /*
1181 : * We have no way of knowing if the current IOContext is
1182 : * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1183 : * point, so count the fsync as being in the IOCONTEXT_NORMAL
1184 : * IOContext. This is probably okay, because the number of backend
1185 : * fsyncs doesn't say anything about the efficacy of the
1186 : * BufferAccessStrategy. And counting both fsyncs done in
1187 : * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1188 : * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1189 : * backend fsyncs.
1190 : */
2 andres 1191 GNC 135 : pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1192 : IOOP_FSYNC, io_start, 1);
1193 : }
9770 scrappy 1194 GIC 1058988 : }
1195 :
5624 tgl 1196 EUB : /*
1401 akapila 1197 : * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1198 : */
1199 : static void
277 rhaas 1200 GNC 28471 : register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
1201 : BlockNumber segno)
5624 tgl 1202 EUB : {
1466 tmunro 1203 : FileTag tag;
1204 :
277 rhaas 1205 GNC 28471 : INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1466 tmunro 1206 EUB :
1207 : /* Should never be used with temp relations */
277 rhaas 1208 GNC 28471 : Assert(!RelFileLocatorBackendIsTemp(rlocator));
3918 tgl 1209 EUB :
1466 tmunro 1210 GIC 28471 : RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
5624 tgl 1211 CBC 28471 : }
1212 :
1213 : /*
1214 : * register_forget_request() -- forget any fsyncs for a relation fork's segment
1215 : */
1216 : static void
277 rhaas 1217 GNC 109825 : register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
1218 : BlockNumber segno)
1219 : {
1466 tmunro 1220 ECB : FileTag tag;
1221 :
277 rhaas 1222 GNC 109825 : INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
5624 bruce 1223 EUB :
1466 tmunro 1224 GIC 109825 : RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
5926 tgl 1225 109825 : }
1226 :
1227 : /*
1445 fujii 1228 ECB : * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
5926 tgl 1229 : */
1230 : void
1466 tmunro 1231 GIC 29 : ForgetDatabaseSyncRequests(Oid dbid)
1232 : {
1233 : FileTag tag;
1234 : RelFileLocator rlocator;
1235 :
277 rhaas 1236 GNC 29 : rlocator.dbOid = dbid;
1237 29 : rlocator.spcOid = 0;
1238 29 : rlocator.relNumber = 0;
5926 tgl 1239 ECB :
277 rhaas 1240 GNC 29 : INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1241 :
1466 tmunro 1242 GIC 29 : RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
8198 vadim4o 1243 29 : }
1244 :
1245 : /*
1246 : * DropRelationFiles -- drop files of all given relations
1247 : */
1248 : void
277 rhaas 1249 GNC 2215 : DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
1250 : {
1251 : SMgrRelation *srels;
1252 : int i;
1253 :
1739 fujii 1254 GIC 2215 : srels = palloc(sizeof(SMgrRelation) * ndelrels);
1739 fujii 1255 CBC 8330 : for (i = 0; i < ndelrels; i++)
1256 : {
1739 fujii 1257 GIC 6115 : SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
1258 :
1259 6115 : if (isRedo)
1260 : {
1261 : ForkNumber fork;
1262 :
1263 30455 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
1739 fujii 1264 CBC 24364 : XLogDropRelation(delrels[i], fork);
1265 : }
1266 6115 : srels[i] = srel;
1267 : }
1268 :
1739 fujii 1269 GIC 2215 : smgrdounlinkall(srels, ndelrels, isRedo);
1270 :
1474 tomas.vondra 1271 8330 : for (i = 0; i < ndelrels; i++)
1739 fujii 1272 6115 : smgrclose(srels[i]);
1273 2215 : pfree(srels);
1739 fujii 1274 CBC 2215 : }
1739 fujii 1275 EUB :
1276 :
9770 scrappy 1277 ECB : /*
1278 : * _fdvec_resize() -- Resize the fork's open segments array
1279 : */
1280 : static void
2404 andres 1281 GIC 2239960 : _fdvec_resize(SMgrRelation reln,
1282 : ForkNumber forknum,
1283 : int nseg)
1284 : {
1285 2239960 : if (nseg == 0)
1286 : {
1287 973456 : if (reln->md_num_open_segs[forknum] > 0)
1288 : {
1289 973456 : pfree(reln->md_seg_fds[forknum]);
2404 andres 1290 CBC 973456 : reln->md_seg_fds[forknum] = NULL;
2404 andres 1291 EUB : }
1292 : }
2404 andres 1293 GIC 1266504 : else if (reln->md_num_open_segs[forknum] == 0)
1294 : {
1295 1266504 : reln->md_seg_fds[forknum] =
1296 1266504 : MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
2404 andres 1297 ECB : }
1298 : else
2404 andres 1299 EUB : {
1300 : /*
1301 : * It doesn't seem worthwhile complicating the code to amortize
1302 : * repalloc() calls. Those are far faster than PathNameOpenFile() or
1185 noah 1303 ECB : * FileClose(), and the memory context internally will sometimes avoid
1304 : * doing an actual reallocation.
2404 andres 1305 : */
2404 andres 1306 UIC 0 : reln->md_seg_fds[forknum] =
1307 0 : repalloc(reln->md_seg_fds[forknum],
1308 : sizeof(MdfdVec) * nseg);
1309 : }
1310 :
2404 andres 1311 GIC 2239960 : reln->md_num_open_segs[forknum] = nseg;
9453 vadim4o 1312 2239960 : }
1313 :
1314 : /*
1315 : * Return the filename for the specified segment of the relation. The
1316 : * returned string is palloc'd.
9453 vadim4o 1317 ECB : */
1318 : static char *
4995 heikki.linnakangas 1319 GIC 59662 : _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1320 : {
4790 bruce 1321 ECB : char *path,
1322 : *fullpath;
1323 :
277 rhaas 1324 GNC 59662 : path = relpath(reln->smgr_rlocator, forknum);
1325 :
9345 bruce 1326 CBC 59662 : if (segno > 0)
1327 : {
3380 peter_e 1328 GIC 59662 : fullpath = psprintf("%s.%u", path, segno);
8400 tgl 1329 59662 : pfree(path);
9345 bruce 1330 ECB : }
1331 : else
9345 bruce 1332 UIC 0 : fullpath = path;
9345 bruce 1333 ECB :
4995 heikki.linnakangas 1334 GIC 59662 : return fullpath;
4995 heikki.linnakangas 1335 ECB : }
4995 heikki.linnakangas 1336 EUB :
1337 : /*
1338 : * Open the specified segment of the relation,
1339 : * and make a MdfdVec object for it. Returns NULL on failure.
1340 : */
1341 : static MdfdVec *
4995 heikki.linnakangas 1342 GIC 59652 : _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1343 : int oflags)
1344 : {
1345 : MdfdVec *v;
1346 : File fd;
1347 : char *fullpath;
1348 :
1349 59652 : fullpath = _mdfd_segpath(reln, forknum, segno);
1350 :
1351 : /* open the file */
1 tmunro 1352 GNC 59652 : fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
1353 :
8400 tgl 1354 GIC 59652 : pfree(fullpath);
9345 bruce 1355 ECB :
9345 bruce 1356 GIC 59652 : if (fd < 0)
7032 neilc 1357 59652 : return NULL;
1358 :
1359 : /*
1360 : * Segments are always opened in order from lowest to highest, so we must
1168 tmunro 1361 ECB : * be adding a new one at the end.
1362 : */
1168 tmunro 1363 UIC 0 : Assert(segno == reln->md_num_open_segs[forknum]);
1364 :
1365 0 : _fdvec_resize(reln, forknum, segno + 1);
9345 bruce 1366 ECB :
1367 : /* fill the entry */
2404 andres 1368 UIC 0 : v = &reln->md_seg_fds[forknum][segno];
9345 bruce 1369 LBC 0 : v->mdfd_vfd = fd;
6887 tgl 1370 UIC 0 : v->mdfd_segno = segno;
2404 andres 1371 ECB :
5354 heikki.linnakangas 1372 LBC 0 : Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1373 :
1374 : /* all done */
8986 bruce 1375 UIC 0 : return v;
1376 : }
1377 :
7033 neilc 1378 ECB : /*
1379 : * _mdfd_getseg() -- Find the segment of the relation holding the
1380 : * specified block.
1381 : *
1382 : * If the segment doesn't exist, we ereport, return NULL, or create the
4622 rhaas 1383 : * segment, according to "behavior". Note: skipFsync is only used in the
1384 : * EXTENSION_CREATE case.
7033 neilc 1385 : */
8620 tgl 1386 : static MdfdVec *
5354 heikki.linnakangas 1387 GIC 2927626 : _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1388 : bool skipFsync, int behavior)
1389 : {
1390 : MdfdVec *v;
1391 : BlockNumber targetseg;
6887 tgl 1392 ECB : BlockNumber nextsegno;
1393 :
1394 : /* some way to handle non-existent segments needs to be specified */
2531 andres 1395 GIC 2927626 : Assert(behavior &
1396 : (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
337 tmunro 1397 ECB : EXTENSION_DONT_OPEN));
2531 andres 1398 :
5940 tgl 1399 CBC 2927626 : targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1400 :
2404 andres 1401 ECB : /* if an existing and opened segment, we're done */
2404 andres 1402 GIC 2927626 : if (targetseg < reln->md_num_open_segs[forknum])
6887 tgl 1403 ECB : {
2404 andres 1404 CBC 2678715 : v = &reln->md_seg_fds[forknum][targetseg];
2404 andres 1405 GIC 2678715 : return v;
1406 : }
1407 :
1408 : /* The caller only wants the segment if we already had it open. */
337 tmunro 1409 248911 : if (behavior & EXTENSION_DONT_OPEN)
337 tmunro 1410 CBC 277 : return NULL;
1411 :
1412 : /*
1413 : * The target segment is not yet open. Iterate over all the segments
1414 : * between the last opened and the target segment. This way missing
2404 andres 1415 ECB : * segments either raise an error, or get created (according to
1416 : * 'behavior'). Start with either the last opened, or the first segment if
1417 : * none was opened before.
1418 : */
2404 andres 1419 GIC 248634 : if (reln->md_num_open_segs[forknum] > 0)
2404 andres 1420 CBC 10 : v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1421 : else
1422 : {
1362 tmunro 1423 GIC 248624 : v = mdopenfork(reln, forknum, behavior);
2404 andres 1424 CBC 248621 : if (!v)
2404 andres 1425 LBC 0 : return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1426 : }
2404 andres 1427 ECB :
2404 andres 1428 GIC 248631 : for (nextsegno = reln->md_num_open_segs[forknum];
1429 248631 : nextsegno <= targetseg; nextsegno++)
2404 andres 1430 ECB : {
2404 andres 1431 GIC 10 : BlockNumber nblocks = _mdnblocks(reln, forknum, v);
2404 andres 1432 CBC 10 : int flags = 0;
2539 andres 1433 ECB :
2404 andres 1434 CBC 10 : Assert(nextsegno == v->mdfd_segno + 1);
2404 andres 1435 ECB :
2404 andres 1436 GIC 10 : if (nblocks > ((BlockNumber) RELSEG_SIZE))
2404 andres 1437 UIC 0 : elog(FATAL, "segment too big");
1438 :
2404 andres 1439 GIC 10 : if ((behavior & EXTENSION_CREATE) ||
1440 10 : (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1441 : {
2404 andres 1442 ECB : /*
1443 : * Normally we will create new segments only if authorized by the
1444 : * caller (i.e., we are doing mdextend()). But when doing WAL
1445 : * recovery, create segments anyway; this allows cases such as
1446 : * replaying WAL data that has a write into a high-numbered
1447 : * segment of a relation that was later deleted. We want to go
1448 : * ahead and create the segments so we can finish out the replay.
1449 : *
1450 : * We have to maintain the invariant that segments before the last
1451 : * active segment are of size RELSEG_SIZE; therefore, if
1452 : * extending, pad them out with zeroes if needed. (This only
1453 : * matters if in recovery, or if the caller is extending the
1454 : * relation discontiguously, but that can happen in hash indexes.)
1455 : */
2404 andres 1456 LBC 0 : if (nblocks < ((BlockNumber) RELSEG_SIZE))
5940 tgl 1457 ECB : {
1 tmunro 1458 UNC 0 : char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1459 : MCXT_ALLOC_ZERO);
1460 :
2404 andres 1461 UIC 0 : mdextend(reln, forknum,
1462 0 : nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1463 : zerobuf, skipFsync);
1464 0 : pfree(zerobuf);
1465 : }
1466 0 : flags = O_CREAT;
1467 : }
2404 andres 1468 GBC 10 : else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
2404 andres 1469 EUB : nblocks < ((BlockNumber) RELSEG_SIZE))
1470 : {
1471 : /*
1472 : * When not extending (or explicitly including truncated
2404 andres 1473 ECB : * segments), only open the next segment if the current one is
1474 : * exactly RELSEG_SIZE. If not (this branch), either return NULL
1475 : * or fail.
1476 : */
2404 andres 1477 GIC 10 : if (behavior & EXTENSION_RETURN_NULL)
1478 : {
1479 : /*
1480 : * Some callers discern between reasons for _mdfd_getseg()
2404 andres 1481 ECB : * returning NULL based on errno. As there's no failing
1482 : * syscall involved in this case, explicitly set errno to
1483 : * ENOENT, as that seems the closest interpretation.
1484 : */
2404 andres 1485 UIC 0 : errno = ENOENT;
2404 andres 1486 LBC 0 : return NULL;
1487 : }
2539 andres 1488 ECB :
2404 andres 1489 GIC 10 : ereport(ERROR,
2404 andres 1490 ECB : (errcode_for_file_access(),
1491 : errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1492 : _mdfd_segpath(reln, forknum, nextsegno),
1493 : blkno, nblocks)));
2404 andres 1494 EUB : }
1495 :
2404 andres 1496 LBC 0 : v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1497 :
2404 andres 1498 UIC 0 : if (v == NULL)
1499 : {
1500 0 : if ((behavior & EXTENSION_RETURN_NULL) &&
1501 0 : FILE_POSSIBLY_DELETED(errno))
1502 0 : return NULL;
1503 0 : ereport(ERROR,
2404 andres 1504 ECB : (errcode_for_file_access(),
1505 : errmsg("could not open file \"%s\" (target block %u): %m",
1506 : _mdfd_segpath(reln, forknum, nextsegno),
1507 : blkno)));
1508 : }
1509 : }
1510 :
8986 bruce 1511 CBC 248621 : return v;
1512 : }
1513 :
8399 tgl 1514 ECB : /*
1515 : * Get number of blocks present in a single disk file
8400 1516 : */
1517 : static BlockNumber
5354 heikki.linnakangas 1518 CBC 4252034 : _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
9770 scrappy 1519 ECB : {
1520 : off_t len;
1521 :
1614 tmunro 1522 GIC 4252034 : len = FileSize(seg->mdfd_vfd);
8397 bruce 1523 4252034 : if (len < 0)
5940 tgl 1524 UIC 0 : ereport(ERROR,
5940 tgl 1525 EUB : (errcode_for_file_access(),
1526 : errmsg("could not seek to end of file \"%s\": %m",
4995 heikki.linnakangas 1527 : FilePathName(seg->mdfd_vfd))));
1528 : /* note that this calculation will ignore any partial block at EOF */
5940 tgl 1529 GIC 4252034 : return (BlockNumber) (len / BLCKSZ);
9770 scrappy 1530 EUB : }
1466 tmunro 1531 :
1532 : /*
1533 : * Sync a file to disk, given a file tag. Write the path into an output
1534 : * buffer so the caller can use it in error messages.
1535 : *
1536 : * Return 0 on success, -1 on failure, with errno set.
1537 : */
1538 : int
1466 tmunro 1539 UIC 0 : mdsyncfiletag(const FileTag *ftag, char *path)
1540 : {
277 rhaas 1541 UNC 0 : SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
1542 : File file;
1543 : instr_time io_start;
1544 : bool need_to_close;
1545 : int result,
1546 : save_errno;
1547 :
1548 : /* See if we already have the file open, or need to open it. */
1212 tmunro 1549 UIC 0 : if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1212 tmunro 1550 ECB : {
1212 tmunro 1551 UIC 0 : file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1552 0 : strlcpy(path, FilePathName(file), MAXPGPATH);
1553 0 : need_to_close = false;
1554 : }
1555 : else
1556 : {
1557 : char *p;
1212 tmunro 1558 ECB :
1212 tmunro 1559 UIC 0 : p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1560 0 : strlcpy(path, p, MAXPGPATH);
1561 0 : pfree(p);
1212 tmunro 1562 ECB :
1 tmunro 1563 UNC 0 : file = PathNameOpenFile(path, _mdfd_open_flags());
1212 tmunro 1564 UIC 0 : if (file < 0)
1212 tmunro 1565 LBC 0 : return -1;
1212 tmunro 1566 UIC 0 : need_to_close = true;
1212 tmunro 1567 ECB : }
1568 :
2 andres 1569 UNC 0 : io_start = pgstat_prepare_io_time();
1570 :
1571 : /* Sync the file. */
1212 tmunro 1572 UIC 0 : result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1573 0 : save_errno = errno;
1212 tmunro 1574 ECB :
1212 tmunro 1575 LBC 0 : if (need_to_close)
1212 tmunro 1576 UIC 0 : FileClose(file);
1577 :
2 andres 1578 UNC 0 : pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1579 : IOOP_FSYNC, io_start, 1);
1580 :
1212 tmunro 1581 UIC 0 : errno = save_errno;
1582 0 : return result;
1583 : }
1584 :
1585 : /*
1586 : * Unlink a file, given a file tag. Write the path into an output
1466 tmunro 1587 ECB : * buffer so the caller can use it in error messages.
1588 : *
1589 : * Return 0 on success, -1 on failure, with errno set.
1590 : */
1591 : int
1466 tmunro 1592 CBC 27428 : mdunlinkfiletag(const FileTag *ftag, char *path)
1466 tmunro 1593 EUB : {
1594 : char *p;
1595 :
1466 tmunro 1596 ECB : /* Compute the path. */
277 rhaas 1597 GNC 27428 : p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1466 tmunro 1598 GIC 27428 : strlcpy(path, p, MAXPGPATH);
1466 tmunro 1599 CBC 27428 : pfree(p);
1466 tmunro 1600 ECB :
1601 : /* Try to unlink the file. */
1466 tmunro 1602 CBC 27428 : return unlink(path);
1603 : }
1466 tmunro 1604 ECB :
1466 tmunro 1605 EUB : /*
1606 : * Check if a given candidate request matches a given tag, when processing
1466 tmunro 1607 ECB : * a SYNC_FILTER_REQUEST request. This will be called for all pending
1608 : * requests to find out whether to forget them.
1609 : */
1610 : bool
1466 tmunro 1611 GIC 4329 : mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1612 : {
1613 : /*
1614 : * For now we only use filter requests as a way to drop all scheduled
1615 : * callbacks relating to a given database, when dropping the database.
1616 : * We'll return true for all candidates that have the same database OID as
1617 : * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1618 : */
277 rhaas 1619 GNC 4329 : return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1620 : }
|