Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 1024 on many modern
20 : * operating systems, but may be lower on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a file is
24 : * opened using these interfaces, all subsequent operations must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them; there is no automatic mechanism in fd.c for that.
41 : *
42 : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : * temporary files that have names so that they can be shared between
44 : * backends. Such files are automatically closed and count against the
45 : * temporary file limit of the backend that creates them, but unlike anonymous
46 : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : * ownership mechanism that provides automatic cleanup for shared files when
48 : * the last of a group of backends detaches.
49 : *
50 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : * They behave like the corresponding native functions, except that the handle
53 : * is registered with the current subtransaction, and will be automatically
54 : * closed at abort. These are intended mainly for short operations like
55 : * reading a configuration file; there is a limit on the number of files that
56 : * can be opened using these functions at any one time.
57 : *
58 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : * release file descriptors in use by the virtual file descriptors if
60 : * necessary. There is no automatic cleanup of file descriptors returned by
61 : * BasicOpenFile; it is solely the caller's responsibility to close the file
62 : * descriptor by calling close(2).
63 : *
64 : * If a non-virtual file descriptor needs to be held open for any length of
65 : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : *
70 : *-------------------------------------------------------------------------
71 : */
72 :
73 : #include "postgres.h"
74 :
75 : #include <dirent.h>
76 : #include <sys/file.h>
77 : #include <sys/param.h>
78 : #include <sys/resource.h> /* for getrlimit */
79 : #include <sys/stat.h>
80 : #include <sys/types.h>
81 : #ifndef WIN32
82 : #include <sys/mman.h>
83 : #endif
84 : #include <limits.h>
85 : #include <unistd.h>
86 : #include <fcntl.h>
87 :
88 : #include "access/xact.h"
89 : #include "access/xlog.h"
90 : #include "catalog/pg_tablespace.h"
91 : #include "common/file_perm.h"
92 : #include "common/file_utils.h"
93 : #include "common/pg_prng.h"
94 : #include "miscadmin.h"
95 : #include "pgstat.h"
96 : #include "portability/mem.h"
97 : #include "postmaster/startup.h"
98 : #include "storage/fd.h"
99 : #include "storage/ipc.h"
100 : #include "utils/guc.h"
101 : #include "utils/guc_hooks.h"
102 : #include "utils/resowner_private.h"
103 : #include "utils/varlena.h"
104 :
/*
 * Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data.
 *
 * The three conditions below mirror, in the same order, the three
 * conditionally-compiled strategies inside pg_flush_data(); the symbol is
 * defined as soon as any one of them is available.
 */
#if defined(HAVE_SYNC_FILE_RANGE)
/* sync_file_range(SYNC_FILE_RANGE_WRITE) is available */
#define PG_FLUSH_DATA_WORKS 1
#elif !defined(WIN32) && defined(MS_ASYNC)
/* mmap() + msync(MS_ASYNC) is available */
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
/* posix_fadvise(POSIX_FADV_DONTNEED) is available */
#define PG_FLUSH_DATA_WORKS 1
#endif
113 :
114 : /*
115 : * We must leave some file descriptors free for system(), the dynamic loader,
116 : * and other code that tries to open files without consulting fd.c. This
117 : * is the number left free. (While we try fairly hard to prevent EMFILE
118 : * errors, there's never any guarantee that we won't get ENFILE due to
119 : * other processes chewing up FDs. So it's a bad idea to try to open files
120 : * without consulting fd.c. Nonetheless we cannot control all code.)
121 : *
122 : * Because this is just a fixed setting, we are effectively assuming that
123 : * no such code will leave FDs open over the long term; otherwise the slop
124 : * is likely to be insufficient. Note in particular that we expect that
125 : * loading a shared library does not result in any permanent increase in
126 : * the number of open files. (This appears to be true on most if not
127 : * all platforms as of Feb 2004.)
128 : */
129 : #define NUM_RESERVED_FDS 10
130 :
131 : /*
132 : * If we have fewer than this many usable FDs after allowing for the reserved
133 : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 : * much less than that. Note that this value ensures numExternalFDs can be
135 : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 : * will not pass unless that can grow to at least 14.)
137 : */
138 : #define FD_MINFREE 48
139 :
140 : /*
141 : * A number of platforms allow individual processes to open many more files
142 : * than they can really support when *many* processes do the same thing.
143 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 : * what the postmaster's initial probe suggests will work.
145 : */
146 : int max_files_per_process = 1000;
147 :
148 : /*
149 : * Maximum number of file descriptors to open for operations that fd.c knows
150 : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 : * to a conservative value, and remains that way indefinitely in bootstrap or
152 : * standalone-backend cases. In normal postmaster operation, the postmaster
153 : * calls set_max_safe_fds() late in initialization to update the value, and
154 : * that value is then inherited by forked subprocesses.
155 : *
156 : * Note: the value of max_files_per_process is taken into account while
157 : * setting this variable, and so need not be tested separately.
158 : */
159 : int max_safe_fds = FD_MINFREE; /* default if not changed */
160 :
161 : /* Whether it is safe to continue running after fsync() fails. */
162 : bool data_sync_retry = false;
163 :
164 : /* How SyncDataDirectory() should do its job. */
165 : int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
166 :
167 : /* Which kinds of files should be opened with PG_O_DIRECT. */
168 : int io_direct_flags;
169 :
/*
 * Debugging support.
 *
 * DO_DB(A) wraps a debugging statement A.  When FDDEBUG is defined, A is
 * executed with errno saved beforehand and restored afterwards, so the
 * debugging code cannot disturb the errno seen by the surrounding code.
 * When FDDEBUG is not defined, DO_DB() expands to a no-op expression and A
 * is not evaluated at all.
 */

#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif
183 :
/* Value stored in Vfd.fd while the VFD has no kernel descriptor assigned */
#define VFD_CLOSED (-1)

/*
 * FileIsValid: true if 'file' indexes a VfdCache slot that is currently in
 * use, i.e. it is within 1 .. SizeVfdCache-1 (slot 0 is only the list
 * header) and the slot has a fileName.  Note that 'file' is evaluated more
 * than once.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/*
 * FileIsNotOpen: true if the VFD currently has no kernel descriptor.  This
 * does no range checking of its own.
 */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
195 :
/*
 * One entry of the VfdCache array.
 *
 * A slot is in use ("virtually open") when fileName is non-NULL; it holds a
 * kernel descriptor only while fd != VFD_CLOSED.  The remembered fileName,
 * fileFlags and fileMode are what is needed to open(2) the file again.
 * Free slots are chained through nextFree; the lruMoreRecently and
 * lruLessRecently links form the recency-of-use ring anchored at slot 0.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state (FD_* bits) */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
210 :
211 : /*
212 : * Virtual File Descriptor array pointer and size. This grows as
213 : * needed. 'File' values are indexes into this array.
214 : * Note that VfdCache[0] is not a usable VFD, just a list header.
215 : */
216 : static Vfd *VfdCache;
217 : static Size SizeVfdCache = 0;
218 :
219 : /*
220 : * Number of file descriptors known to be in use by VFD entries.
221 : */
222 : static int nfile = 0;
223 :
224 : /*
225 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 : * to close
227 : */
228 : static bool have_xact_temporary_files = false;
229 :
230 : /*
231 : * Tracks the total size of all temporary files. Note: when temp_file_limit
232 : * is being enforced, this cannot overflow since the limit cannot be more
233 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 : * overflow, but we don't care.
235 : */
236 : static uint64 temporary_files_size = 0;
237 :
238 : /* Temporary file access initialized and not yet shut down? */
239 : #ifdef USE_ASSERT_CHECKING
240 : static bool temporary_files_allowed = false;
241 : #endif
242 :
/*
 * List of OS handles opened with AllocateFile, OpenPipeStream, AllocateDir
 * and OpenTransientFile.
 *
 * Each entry records what kind of handle it is, the handle itself, and a
 * subtransaction ID (presumably the subtransaction that opened it, which is
 * what allows cleanup at subtransaction abort -- confirm against the code
 * that fills in create_subid).
 */
typedef enum
{
	AllocateDescFile,			/* stdio stream; presumably from AllocateFile */
	AllocateDescPipe,			/* pipe stream; presumably from OpenPipeStream */
	AllocateDescDir,			/* directory; presumably from AllocateDir */
	AllocateDescRawFD			/* bare FD; presumably from OpenTransientFile */
} AllocateDescKind;

typedef struct
{
	AllocateDescKind kind;		/* selects which member of desc is valid */
	SubTransactionId create_subid;
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;

/* allocatedDescs[] array, its used-entry count and its allocated length */
static int	numAllocatedDescs = 0;
static int	maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
270 :
271 : /*
272 : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 : */
274 : static int numExternalFDs = 0;
275 :
276 : /*
277 : * Number of temporary files opened during the current session;
278 : * this is used in generation of tempfile names.
279 : */
280 : static long tempFileCounter = 0;
281 :
282 : /*
283 : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 : * indicating that the current database's default tablespace should be used.)
285 : * When numTempTableSpaces is -1, this has not been set in the current
286 : * transaction.
287 : */
288 : static Oid *tempTableSpaces = NULL;
289 : static int numTempTableSpaces = -1;
290 : static int nextTempTableSpace = 0;
291 :
292 :
293 : /*--------------------
294 : *
295 : * Private Routines
296 : *
297 : * Delete - delete a file from the Lru ring
298 : * LruDelete - remove a file from the Lru ring and close its FD
299 : * Insert - put a file at the front of the Lru ring
300 : * LruInsert - put a file at the front of the Lru ring and open it
301 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 : * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 : * FreeVfd - free a file record
305 : *
306 : * The Least Recently Used ring is a doubly linked list that begins and
307 : * ends on element zero. Element zero is special -- it doesn't represent
308 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 : * anchor that shows us the beginning/end of the ring.
310 : * Only VFD elements that are currently really open (have an FD assigned) are
311 : * in the Lru ring. Elements that are "virtually" open can be recognized
312 : * by having a non-null fileName field.
313 : *
314 : * example:
315 : *
316 : * /--less----\ /---------\
317 : * v \ v \
318 : * #0 --more---> LeastRecentlyUsed --more-\ \
319 : * ^\ | |
320 : * \\less--> MostRecentlyUsedFile <---/ |
321 : * \more---/ \--less--/
322 : *
323 : *--------------------
324 : */
325 : static void Delete(File file);
326 : static void LruDelete(File file);
327 : static void Insert(File file);
328 : static int LruInsert(File file);
329 : static bool ReleaseLruFile(void);
330 : static void ReleaseLruFiles(void);
331 : static File AllocateVfd(void);
332 : static void FreeVfd(File file);
333 :
334 : static int FileAccess(File file);
335 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 : static bool reserveAllocatedDesc(void);
337 : static int FreeDesc(AllocateDesc *desc);
338 :
339 : static void BeforeShmemExit_Files(int code, Datum arg);
340 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 : static void RemovePgTempRelationFiles(const char *tsdirname);
342 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 :
344 : static void walkdir(const char *path,
345 : void (*action) (const char *fname, bool isdir, int elevel),
346 : bool process_symlinks,
347 : int elevel);
348 : #ifdef PG_FLUSH_DATA_WORKS
349 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 : #endif
351 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 :
354 : static int fsync_parent_path(const char *fname, int elevel);
355 :
356 :
/*
 * pg_fsync --- fsync a file descriptor, with or without writethrough
 *
 * Where the platform provides a write-through fsync that differs from plain
 * fsync, and sync_method asks for it, the request is routed to
 * pg_fsync_writethrough(); in every other case pg_fsync_no_writethrough()
 * does the work.  The result is whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they demand of
	 * the descriptor, and the demands differ between regular files and
	 * directories.  To keep the code portable even when the current platform
	 * is lenient, verify here that anything headed for fsync() was opened in
	 * a mode acceptable everywhere: files writable (O_RDWR or O_WRONLY),
	 * directories not writable (O_RDONLY).
	 *
	 * An fstat() failure is ignored; the fsync() that follows will report
	 * whatever is wrong.  Checking at this level means the sanity check also
	 * runs when fsync itself is disabled.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			open_flags = fcntl(fd, F_GETFL);
		bool		opened_for_write;

		/* O_RDONLY is historically 0, so only the write bits can be tested */
		opened_for_write = (open_flags & (O_RDWR | O_WRONLY)) != 0;

		if (S_ISDIR(statbuf.st_mode))
			Assert(!opened_for_write);
		else
			Assert(opened_for_write);
	}
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
6533 bruce 409 ECB :
410 :
411 : /*
412 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
413 : * enableFsync is off
414 : */
415 : int
6533 bruce 416 GIC 117698 : pg_fsync_no_writethrough(int fd)
417 : {
8157 tgl 418 CBC 117698 : if (enableFsync)
8157 tgl 419 UIC 0 : return fsync(fd);
8157 tgl 420 ECB : else
8157 tgl 421 GBC 117698 : return 0;
422 : }
8157 tgl 423 ECB :
424 : /*
425 : * pg_fsync_writethrough
426 : */
427 : int
6533 bruce 428 UIC 0 : pg_fsync_writethrough(int fd)
429 : {
6533 bruce 430 UBC 0 : if (enableFsync)
431 : {
6533 bruce 432 EUB : #ifdef WIN32
433 : return _commit(fd);
434 : #elif defined(F_FULLFSYNC)
435 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
436 : #else
4794 tgl 437 UIC 0 : errno = ENOSYS;
6533 bruce 438 0 : return -1;
6533 bruce 439 EUB : #endif
6291 tgl 440 : }
441 : else
6533 bruce 442 UIC 0 : return 0;
443 : }
6533 bruce 444 EUB :
445 : /*
446 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
447 : */
448 : int
8085 tgl 449 UBC 0 : pg_fdatasync(int fd)
450 : {
451 0 : if (enableFsync)
452 0 : return fdatasync(fd);
453 : else
8085 tgl 454 UIC 0 : return 0;
455 : }
456 :
4801 stark 457 ECB : /*
458 : * pg_flush_data --- advise OS that the described dirty data should be flushed
459 : *
460 : * offset of 0 with nbytes 0 means that the entire file should be flushed
461 : */
462 : void
2606 andres 463 GIC 287494 : pg_flush_data(int fd, off_t offset, off_t nbytes)
464 : {
2606 andres 465 ECB : /*
466 : * Right now file flushing is primarily used to avoid making later
467 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
468 : * if fsyncs are disabled - that's a decision we might want to make
469 : * configurable at some point.
470 : */
2606 andres 471 GIC 287494 : if (!enableFsync)
472 287494 : return;
473 :
474 : /*
475 : * We compile all alternatives that are supported on the current platform,
476 : * to find portability problems more easily.
2606 andres 477 EUB : */
3922 tgl 478 : #if defined(HAVE_SYNC_FILE_RANGE)
479 : {
480 : int rc;
481 : static bool not_implemented_by_kernel = false;
482 :
1505 tmunro 483 UIC 0 : if (not_implemented_by_kernel)
484 0 : return;
485 :
486 : /*
487 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
488 : * tells the OS that writeback for the specified blocks should be
2606 andres 489 EUB : * started, but that we don't want to wait for completion. Note that
490 : * this call might block if too much dirty data exists in the range.
2552 tgl 491 : * This is the preferable method on OSs supporting it, as it works
492 : * reliably when available (contrast to msync()) and doesn't flush out
493 : * clean data (like FADV_DONTNEED).
494 : */
2606 andres 495 UIC 0 : rc = sync_file_range(fd, offset, nbytes,
496 : SYNC_FILE_RANGE_WRITE);
497 0 : if (rc != 0)
498 : {
499 : int elevel;
1505 tmunro 500 EUB :
501 : /*
502 : * For systems that don't have an implementation of
503 : * sync_file_range() such as Windows WSL, generate only one
504 : * warning and then suppress all further attempts by this process.
505 : */
1505 tmunro 506 UBC 0 : if (errno == ENOSYS)
507 : {
508 0 : elevel = WARNING;
1505 tmunro 509 UIC 0 : not_implemented_by_kernel = true;
510 : }
511 : else
512 0 : elevel = data_sync_elevel(WARNING);
1505 tmunro 513 EUB :
1505 tmunro 514 UIC 0 : ereport(elevel,
515 : (errcode_for_file_access(),
516 : errmsg("could not flush dirty data: %m")));
517 : }
518 :
2606 andres 519 0 : return;
520 : }
521 : #endif
522 : #if !defined(WIN32) && defined(MS_ASYNC)
523 : {
524 : void *p;
525 : static int pagesize = 0;
526 :
527 : /*
528 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
529 : * writeback. On linux it only does so if MS_SYNC is specified, but
530 : * then it does the writeback synchronously. Luckily all common linux
531 : * systems have sync_file_range(). This is preferable over
532 : * FADV_DONTNEED because it doesn't flush out clean data.
533 : *
534 : * We map the file (mmap()), tell the kernel to sync back the contents
535 : * (msync()), and then remove the mapping again (munmap()).
536 : */
537 :
538 : /* mmap() needs actual length if we want to map whole file */
539 : if (offset == 0 && nbytes == 0)
540 : {
541 : nbytes = lseek(fd, 0, SEEK_END);
542 : if (nbytes < 0)
543 : {
544 : ereport(WARNING,
545 : (errcode_for_file_access(),
546 : errmsg("could not determine dirty data size: %m")));
547 : return;
548 : }
549 : }
550 :
551 : /*
552 : * Some platforms reject partial-page mmap() attempts. To deal with
553 : * that, just truncate the request to a page boundary. If any extra
554 : * bytes don't get flushed, well, it's only a hint anyway.
555 : */
556 :
557 : /* fetch pagesize only once */
558 : if (pagesize == 0)
559 : pagesize = sysconf(_SC_PAGESIZE);
560 :
561 : /* align length to pagesize, dropping any fractional page */
562 : if (pagesize > 0)
563 : nbytes = (nbytes / pagesize) * pagesize;
564 :
565 : /* fractional-page request is a no-op */
566 : if (nbytes <= 0)
567 : return;
568 :
569 : /*
570 : * mmap could well fail, particularly on 32-bit platforms where there
571 : * may simply not be enough address space. If so, silently fall
572 : * through to the next implementation.
573 : */
574 : if (nbytes <= (off_t) SSIZE_MAX)
575 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
576 : else
577 : p = MAP_FAILED;
578 :
579 : if (p != MAP_FAILED)
580 : {
581 : int rc;
582 :
583 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
584 : if (rc != 0)
585 : {
586 : ereport(data_sync_elevel(WARNING),
587 : (errcode_for_file_access(),
588 : errmsg("could not flush dirty data: %m")));
589 : /* NB: need to fall through to munmap()! */
590 : }
591 :
592 : rc = munmap(p, (size_t) nbytes);
593 : if (rc != 0)
594 : {
595 : /* FATAL error because mapping would remain */
596 : ereport(FATAL,
597 : (errcode_for_file_access(),
598 : errmsg("could not munmap() while flushing data: %m")));
599 : }
600 :
601 : return;
602 : }
603 : }
604 : #endif
605 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
606 : {
607 : int rc;
608 :
609 : /*
610 : * Signal the kernel that the passed in range should not be cached
611 : * anymore. This has the, desired, side effect of writing out dirty
612 : * data, and the, undesired, side effect of likely discarding useful
613 : * clean cached blocks. For the latter reason this is the least
614 : * preferable method.
615 : */
616 :
617 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
618 :
619 : if (rc != 0)
620 : {
621 : /* don't error out, this is just a performance optimization */
622 : ereport(WARNING,
623 : (errcode_for_file_access(),
624 : errmsg("could not flush dirty data: %m")));
625 : }
626 :
627 : return;
628 : }
629 : #endif
4801 stark 630 ECB : }
631 :
/*
 * pg_truncate --- set the length of the file named by 'path'
 *
 * Returns the result of the underlying truncate call, with errno describing
 * any failure.  On Windows the file is opened transiently and truncated
 * through its descriptor; if it cannot be opened, -1 is returned.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifndef WIN32
	return truncate(path, length);
#else
	int			fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	int			result;
	int			saved_errno;

	if (fd < 0)
		return -1;

	result = ftruncate(fd, length);

	/* don't let the close clobber ftruncate's errno */
	saved_errno = errno;
	CloseTransientFile(fd);
	errno = saved_errno;

	return result;
#endif
}
659 :
660 : /*
3504 rhaas 661 ECB : * fsync_fname -- fsync a file or directory, handling errors properly
662 : *
663 : * Try to fsync a file or directory. When doing the latter, ignore errors that
664 : * indicate the OS just doesn't allow/require fsyncing directories.
665 : */
666 : void
2587 andres 667 GIC 19861 : fsync_fname(const char *fname, bool isdir)
668 : {
1602 tmunro 669 19861 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
2587 andres 670 19861 : }
671 :
672 : /*
673 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
674 : *
675 : * This routine ensures that, after returning, the effect of renaming file
676 : * persists in case of a crash. A crash while this routine is running will
677 : * leave you with either the pre-existing or the moved file in place of the
678 : * new file; no mixed state or truncated files are possible.
679 : *
680 : * It does so by using fsync on the old filename and the possibly existing
681 : * target filename before the rename, and the target file and directory after.
682 : *
683 : * Note that rename() cannot be used across arbitrary directories, as they
684 : * might not be on the same filesystem. Therefore this routine does not
685 : * support renaming across directories.
686 : *
2587 andres 687 ECB : * Log errors with the caller specified severity.
688 : *
689 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
690 : * valid upon return.
691 : */
692 : int
2587 andres 693 GIC 4718 : durable_rename(const char *oldfile, const char *newfile, int elevel)
694 : {
695 : int fd;
696 :
697 : /*
2587 andres 698 ECB : * First fsync the old and target path (if it exists), to ensure that they
2587 andres 699 EUB : * are properly persistent on disk. Syncing the target file is not
700 : * strictly necessary, but it makes it easier to reason about crashes;
2587 andres 701 ECB : * because it's then guaranteed that either source or target file exists
702 : * after a crash.
703 : */
2587 andres 704 CBC 4718 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
2587 andres 705 UIC 0 : return -1;
2587 andres 706 EUB :
2024 peter_e 707 GIC 4718 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
2587 andres 708 4718 : if (fd < 0)
2587 andres 709 EUB : {
2587 andres 710 GIC 2507 : if (errno != ENOENT)
711 : {
2587 andres 712 UIC 0 : ereport(elevel,
713 : (errcode_for_file_access(),
2587 andres 714 ECB : errmsg("could not open file \"%s\": %m", newfile)));
2587 andres 715 UIC 0 : return -1;
716 : }
717 : }
718 : else
2587 andres 719 EUB : {
2587 andres 720 GBC 2211 : if (pg_fsync(fd) != 0)
2587 andres 721 EUB : {
722 : int save_errno;
723 :
724 : /* close file upon error, might not be in transaction context */
2587 andres 725 UIC 0 : save_errno = errno;
2587 andres 726 UBC 0 : CloseTransientFile(fd);
2587 andres 727 UIC 0 : errno = save_errno;
728 :
2587 andres 729 LBC 0 : ereport(elevel,
730 : (errcode_for_file_access(),
2587 andres 731 EUB : errmsg("could not fsync file \"%s\": %m", newfile)));
2587 andres 732 UIC 0 : return -1;
733 : }
1492 michael 734 EUB :
1373 peter 735 GIC 2211 : if (CloseTransientFile(fd) != 0)
736 : {
1492 michael 737 UIC 0 : ereport(elevel,
738 : (errcode_for_file_access(),
1492 michael 739 ECB : errmsg("could not close file \"%s\": %m", newfile)));
1492 michael 740 UIC 0 : return -1;
1492 michael 741 EUB : }
742 : }
743 :
744 : /* Time to do the real deal... */
2587 andres 745 GBC 4718 : if (rename(oldfile, newfile) < 0)
746 : {
2587 andres 747 UIC 0 : ereport(elevel,
748 : (errcode_for_file_access(),
749 : errmsg("could not rename file \"%s\" to \"%s\": %m",
750 : oldfile, newfile)));
751 0 : return -1;
2587 andres 752 ECB : }
3504 rhaas 753 EUB :
754 : /*
2587 andres 755 ECB : * To guarantee renaming the file is persistent, fsync the file with its
2587 andres 756 EUB : * new name, and its containing directory.
757 : */
2587 andres 758 CBC 4718 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
2587 andres 759 UIC 0 : return -1;
760 :
2587 andres 761 GIC 4718 : if (fsync_parent_path(newfile, elevel) != 0)
2587 andres 762 UIC 0 : return -1;
763 :
2587 andres 764 GIC 4718 : return 0;
765 : }
766 :
/*
 * durable_unlink -- remove a file such that the removal survives a crash
 *
 * The file is unlinked and then its parent directory is fsync'ed, so a crash
 * while this routine is running will leave the system in no mixed state.
 *
 * Problems are logged at the caller-supplied 'elevel'.
 *
 * Returns 0 on success and -1 on failure.  errno is not meaningful on return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) >= 0)
	{
		/*
		 * The directory entry is gone; syncing the parent directory is what
		 * makes that fact persistent.
		 */
		return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
	}

	ereport(elevel,
			(errcode_for_file_access(),
			 errmsg("could not remove file \"%s\": %m",
					fname)));
	return -1;
}
803 :
804 : /*
805 : * InitFileAccess --- initialize this module during backend startup
6453 tgl 806 ECB : *
807 : * This is called during either normal or standalone backend start.
808 : * It is *not* called in the postmaster.
809 : *
610 andres 810 : * Note that this does not initialize temporary file access, that is
811 : * separately initialized via InitTemporaryFileAccess().
812 : */
813 : void
6453 tgl 814 GIC 13291 : InitFileAccess(void)
815 : {
6385 bruce 816 13291 : Assert(SizeVfdCache == 0); /* call me only once */
817 :
818 : /* initialize cache header entry */
6453 tgl 819 CBC 13291 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
820 13291 : if (VfdCache == NULL)
6453 tgl 821 UIC 0 : ereport(FATAL,
822 : (errcode(ERRCODE_OUT_OF_MEMORY),
6453 tgl 823 ECB : errmsg("out of memory")));
824 :
6453 tgl 825 GBC 106328 : MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
6453 tgl 826 GIC 13291 : VfdCache->fd = VFD_CLOSED;
827 :
828 13291 : SizeVfdCache = 1;
610 andres 829 13291 : }
610 andres 830 ECB :
831 : /*
832 : * InitTemporaryFileAccess --- initialize temporary file access during startup
833 : *
834 : * This is called during either normal or standalone backend start.
835 : * It is *not* called in the postmaster.
836 : *
837 : * This is separate from InitFileAccess() because temporary file cleanup can
838 : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
839 : * our reporting has to happen before that. Low level file access should be
610 andres 840 EUB : * available for longer, hence the separate initialization / shutdown of
841 : * temporary file handling.
842 : */
610 andres 843 ECB : void
610 andres 844 CBC 13291 : InitTemporaryFileAccess(void)
845 : {
360 drowley 846 GIC 13291 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
610 andres 847 GBC 13291 : Assert(!temporary_files_allowed); /* call me only once */
610 andres 848 EUB :
849 : /*
850 : * Register before-shmem-exit hook to ensure temp files are dropped while
851 : * we can still report stats.
610 andres 852 ECB : */
610 andres 853 GIC 13291 : before_shmem_exit(BeforeShmemExit_Files, 0);
6453 tgl 854 EUB :
610 andres 855 : #ifdef USE_ASSERT_CHECKING
610 andres 856 GIC 13291 : temporary_files_allowed = true;
610 andres 857 ECB : #endif
6453 tgl 858 GIC 13291 : }
6453 tgl 859 ECB :
6985 860 : /*
861 : * count_usable_fds --- count how many FDs the system will let us open,
862 : * and estimate how many are already open.
863 : *
864 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
865 : * value of max_to_probe might result in an underestimate of already_open;
866 : * we must fill in any "gaps" in the set of used FDs before the calculation
3260 bruce 867 : * of already_open will give the right answer. In practice, max_to_probe
6454 tgl 868 : * of a couple of dozen should be enough to ensure good results.
869 : *
584 870 : * We assume stderr (FD 2) is available for dup'ing. While the calling
871 : * script could theoretically close that, it would be a really bad idea,
872 : * since then one risks loss of error messages from, e.g., libc.
873 : */
874 : static void
6454 tgl 875 GIC 593 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
876 : {
6985 tgl 877 ECB : int *fd;
878 : int size;
6985 tgl 879 CBC 593 : int used = 0;
6985 tgl 880 GIC 593 : int highestfd = 0;
881 : int j;
882 :
883 : #ifdef HAVE_GETRLIMIT
884 : struct rlimit rlim;
885 : int getrlimit_status;
5149 peter_e 886 ECB : #endif
887 :
6985 tgl 888 GIC 593 : size = 1024;
889 593 : fd = (int *) palloc(size * sizeof(int));
890 :
891 : #ifdef HAVE_GETRLIMIT
5149 peter_e 892 593 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
893 593 : if (getrlimit_status != 0)
5149 peter_e 894 UIC 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
2118 tgl 895 ECB : #endif /* HAVE_GETRLIMIT */
896 :
897 : /* dup until failure or probe limit reached */
6985 898 : for (;;)
6985 tgl 899 GIC 592407 : {
900 : int thisfd;
901 :
902 : #ifdef HAVE_GETRLIMIT
5050 bruce 903 ECB :
904 : /*
905 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
906 : * some platforms
907 : */
5149 peter_e 908 CBC 593000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
5149 peter_e 909 UBC 0 : break;
910 : #endif
911 :
584 tgl 912 GIC 593000 : thisfd = dup(2);
6985 913 593000 : if (thisfd < 0)
914 : {
915 : /* Expect EMFILE or ENFILE, else it's fishy */
6985 tgl 916 LBC 0 : if (errno != EMFILE && errno != ENFILE)
584 tgl 917 UIC 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
6985 tgl 918 LBC 0 : break;
919 : }
920 :
6985 tgl 921 GIC 593000 : if (used >= size)
922 : {
6985 tgl 923 UIC 0 : size *= 2;
924 0 : fd = (int *) repalloc(fd, size * sizeof(int));
6985 tgl 925 ECB : }
6985 tgl 926 GIC 593000 : fd[used++] = thisfd;
6985 tgl 927 ECB :
6985 tgl 928 GIC 593000 : if (highestfd < thisfd)
929 593000 : highestfd = thisfd;
930 :
6454 931 593000 : if (used >= max_to_probe)
932 593 : break;
933 : }
934 :
935 : /* release the files we opened */
6985 936 593593 : for (j = 0; j < used; j++)
937 593000 : close(fd[j]);
938 :
939 593 : pfree(fd);
940 :
941 : /*
942 : * Return results. usable_fds is just the number of successful dups. We
943 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
944 : * number) and so already_open is highestfd+1 - usable_fds.
945 : */
946 593 : *usable_fds = used;
6797 bruce 947 CBC 593 : *already_open = highestfd + 1 - used;
6985 tgl 948 GIC 593 : }
949 :
950 : /*
6985 tgl 951 ECB : * set_max_safe_fds
952 : * Determine number of file descriptors that fd.c is allowed to use
953 : */
954 : void
6985 tgl 955 GIC 593 : set_max_safe_fds(void)
956 : {
957 : int usable_fds;
958 : int already_open;
959 :
960 : /*----------
961 : * We want to set max_safe_fds to
962 : * MIN(usable_fds, max_files_per_process - already_open)
963 : * less the slop factor for files that are opened without consulting
964 : * fd.c. This ensures that we won't exceed either max_files_per_process
965 : * or the experimentally-determined EMFILE limit.
966 : *----------
967 : */
6454 968 593 : count_usable_fds(max_files_per_process,
969 : &usable_fds, &already_open);
970 :
6985 971 593 : max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
6985 tgl 972 ECB :
973 : /*
974 : * Take off the FDs reserved for system() etc.
975 : */
6985 tgl 976 GIC 593 : max_safe_fds -= NUM_RESERVED_FDS;
977 :
978 : /*
979 : * Make sure we still have enough to get by.
980 : */
981 593 : if (max_safe_fds < FD_MINFREE)
6985 tgl 982 UIC 0 : ereport(FATAL,
983 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
984 : errmsg("insufficient file descriptors available to start server process"),
985 : errdetail("System allows %d, server needs at least %d.",
986 : max_safe_fds + NUM_RESERVED_FDS,
987 : FD_MINFREE + NUM_RESERVED_FDS)));
988 :
6985 tgl 989 GIC 593 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
990 : max_safe_fds, usable_fds, already_open);
6985 tgl 991 CBC 593 : }
992 :
993 : /*
2024 peter_e 994 ECB : * Open a file with BasicOpenFilePerm() and pass default file mode for the
995 : * fileMode parameter.
2024 peter_e 996 EUB : */
997 : int
2024 peter_e 998 GBC 24767 : BasicOpenFile(const char *fileName, int fileFlags)
999 : {
1828 sfrost 1000 GIC 24767 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
2024 peter_e 1001 EUB : }
1002 :
1003 : /*
1004 : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1005 : *
1006 : * This is exported for use by places that really want a plain kernel FD,
8346 tgl 1007 ECB : * but need to be proof against running out of FDs. Once an FD has been
1008 : * successfully returned, it is the caller's responsibility to ensure that
1009 : * it will not be leaked on ereport()! Most users should *not* call this
1010 : * routine directly, but instead use the VFD abstraction level, which
1011 : * provides protection against descriptor leaks as well as management of
1012 : * files that need to be open for more than a short period of time.
1013 : *
1014 : * Ideally this should be the *only* direct call of open() in the backend.
1015 : * In practice, the postmaster calls open() directly, and there are some
1016 : * direct open() calls done early in backend startup. Those are OK since
1017 : * this module wouldn't have any open files to close at that point anyway.
1018 : */
1019 : int
2024 peter_e 1020 GIC 2476900 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1021 : {
1022 : int fd;
1023 :
8346 tgl 1024 CBC 2476900 : tryAgain:
1025 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1026 :
1027 : /*
1028 : * The value we defined to stand in for O_DIRECT when simulating it with
1029 : * F_NOCACHE had better not collide with any of the standard flags.
629 tmunro 1030 ECB : */
1031 : StaticAssertStmt((PG_O_DIRECT &
1032 : (O_APPEND |
1033 : O_CLOEXEC |
1034 : O_CREAT |
1035 : O_DSYNC |
1036 : O_EXCL |
629 tmunro 1037 EUB : O_RDWR |
1038 : O_RDONLY |
1039 : O_SYNC |
1040 : O_TRUNC |
1041 : O_WRONLY)) == 0,
1042 : "PG_O_DIRECT value collides with standard flag");
1043 : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1044 : #else
8346 tgl 1045 GIC 2476900 : fd = open(fileName, fileFlags, fileMode);
1046 : #endif
1047 :
1048 2476900 : if (fd >= 0)
1049 : {
1050 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1051 : if (fileFlags & PG_O_DIRECT)
629 tmunro 1052 ECB : {
1053 : if (fcntl(fd, F_NOCACHE, 1) < 0)
1054 : {
1055 : int save_errno = errno;
1056 :
1057 : close(fd);
1058 : errno = save_errno;
1059 : return -1;
1060 : }
1061 : }
1062 : #endif
1063 :
8346 tgl 1064 GIC 1998010 : return fd; /* success! */
1065 : }
1066 :
8260 1067 478890 : if (errno == EMFILE || errno == ENFILE)
1068 : {
8053 bruce 1069 UIC 0 : int save_errno = errno;
8260 tgl 1070 ECB :
7199 tgl 1071 UIC 0 : ereport(LOG,
7199 tgl 1072 ECB : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
6385 bruce 1073 : errmsg("out of file descriptors: %m; release and retry")));
8346 tgl 1074 LBC 0 : errno = 0;
8260 tgl 1075 UIC 0 : if (ReleaseLruFile())
1076 0 : goto tryAgain;
1077 0 : errno = save_errno;
1078 : }
1079 :
8346 tgl 1080 GIC 478890 : return -1; /* failure */
1081 : }
1082 :
1083 : /*
1084 : * AcquireExternalFD - attempt to reserve an external file descriptor
1085 : *
1086 : * This should be used by callers that need to hold a file descriptor open
1087 : * over more than a short interval, but cannot use any of the other facilities
1088 : * provided by this module.
1089 : *
1090 : * The difference between this and the underlying ReserveExternalFD function
1091 : * is that this will report failure (by setting errno and returning false)
1092 : * if "too many" external FDs are already reserved. This should be used in
1093 : * any code where the total number of FDs to be reserved is not predictable
1094 : * and small.
1095 : */
1096 : bool
1140 1097 126279 : AcquireExternalFD(void)
1098 : {
1140 tgl 1099 ECB : /*
1100 : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1101 : * "external" FDs.
1102 : */
1140 tgl 1103 CBC 126279 : if (numExternalFDs < max_safe_fds / 3)
1104 : {
1140 tgl 1105 GIC 126279 : ReserveExternalFD();
1106 126279 : return true;
1107 : }
1140 tgl 1108 UIC 0 : errno = EMFILE;
1140 tgl 1109 LBC 0 : return false;
1110 : }
1140 tgl 1111 ECB :
1112 : /*
1113 : * ReserveExternalFD - report external consumption of a file descriptor
1114 : *
1115 : * This should be used by callers that need to hold a file descriptor open
1116 : * over more than a short interval, but cannot use any of the other facilities
1117 : * provided by this module. This just tracks the use of the FD and closes
1118 : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1119 : *
1120 : * Call this directly only in code where failure to reserve the FD would be
1121 : * fatal; for example, the WAL-writing code does so, since the alternative is
1122 : * session failure. Also, it's very unwise to do so in code that could
1123 : * consume more than one FD per process.
1124 : *
1125 : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1126 : * available, it doesn't matter too much whether this is called before or
1127 : * after actually opening the FD; but doing so beforehand reduces the risk of
1128 : * an EMFILE failure if not everybody played nice. In any case, it's solely
1129 : * caller's responsibility to keep the external-FD count in sync with reality.
1130 : */
1131 : void
1140 tgl 1132 GIC 175848 : ReserveExternalFD(void)
1140 tgl 1133 ECB : {
1140 tgl 1134 EUB : /*
1135 : * Release VFDs if needed to stay safe. Because we do this before
1140 tgl 1136 ECB : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1137 : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1138 : */
1140 tgl 1139 GIC 175848 : ReleaseLruFiles();
1140 tgl 1140 ECB :
1140 tgl 1141 CBC 175848 : numExternalFDs++;
1140 tgl 1142 GIC 175848 : }
1143 :
1140 tgl 1144 ECB : /*
1145 : * ReleaseExternalFD - report release of an external file descriptor
1146 : *
1147 : * This is guaranteed not to change errno, so it can be used in failure paths.
1148 : */
1149 : void
1140 tgl 1150 GIC 159582 : ReleaseExternalFD(void)
1151 : {
1152 159582 : Assert(numExternalFDs > 0);
1153 159582 : numExternalFDs--;
1140 tgl 1154 CBC 159582 : }
1155 :
1140 tgl 1156 ECB :
#if defined(FDDEBUG)

/*
 * Debugging aid: emit one LOG line listing the VFD indexes in the LRU ring,
 * walking the lruLessRecently links from the header entry until the walk
 * arrives back at index 0.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		len;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}

	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1177 :
1178 : static void
9770 scrappy 1179 GIC 2054304 : Delete(File file)
9770 scrappy 1180 ECB : {
1181 : Vfd *vfdP;
1182 :
8736 tgl 1183 GIC 2054304 : Assert(file != 0);
1184 :
1185 : DO_DB(elog(LOG, "Delete %d (%s)",
1186 : file, VfdCache[file].fileName));
9345 bruce 1187 ECB : DO_DB(_dump_lru());
1188 :
8736 tgl 1189 CBC 2054304 : vfdP = &VfdCache[file];
1190 :
8736 tgl 1191 GIC 2054304 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
8736 tgl 1192 GBC 2054304 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1193 :
1194 : DO_DB(_dump_lru());
9770 scrappy 1195 GIC 2054304 : }
9770 scrappy 1196 ECB :
1197 : static void
9770 scrappy 1198 GIC 264855 : LruDelete(File file)
1199 : {
1200 : Vfd *vfdP;
1201 :
8736 tgl 1202 264855 : Assert(file != 0);
1203 :
7708 bruce 1204 ECB : DO_DB(elog(LOG, "LruDelete %d (%s)",
1205 : file, VfdCache[file].fileName));
9345 1206 :
8736 tgl 1207 GIC 264855 : vfdP = &VfdCache[file];
1208 :
1209 : /*
1210 : * Close the file. We aren't expecting this to fail; if it does, better
1211 : * to leak the FD than to mess up our internal state.
1212 : */
1373 peter 1213 CBC 264855 : if (close(vfdP->fd) != 0)
1602 tmunro 1214 UIC 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1215 : "could not close file \"%s\": %m", vfdP->fileName);
8736 tgl 1216 GIC 264855 : vfdP->fd = VFD_CLOSED;
2238 tgl 1217 CBC 264855 : --nfile;
1218 :
1219 : /* delete the vfd record from the LRU ring */
2238 tgl 1220 GIC 264855 : Delete(file);
9770 scrappy 1221 264855 : }
1222 :
9770 scrappy 1223 ECB : static void
9770 scrappy 1224 CBC 2283732 : Insert(File file)
9770 scrappy 1225 ECB : {
1226 : Vfd *vfdP;
9345 bruce 1227 EUB :
8736 tgl 1228 GIC 2283732 : Assert(file != 0);
1229 :
1230 : DO_DB(elog(LOG, "Insert %d (%s)",
1231 : file, VfdCache[file].fileName));
1232 : DO_DB(_dump_lru());
1233 :
9345 bruce 1234 2283732 : vfdP = &VfdCache[file];
9345 bruce 1235 ECB :
9345 bruce 1236 GIC 2283732 : vfdP->lruMoreRecently = 0;
9345 bruce 1237 CBC 2283732 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
9345 bruce 1238 GIC 2283732 : VfdCache[0].lruLessRecently = file;
9345 bruce 1239 CBC 2283732 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
9345 bruce 1240 EUB :
1241 : DO_DB(_dump_lru());
9770 scrappy 1242 CBC 2283732 : }
1243 :
1244 : /* returns 0 on success, -1 on re-open failure (with errno set) */
9770 scrappy 1245 ECB : static int
9345 bruce 1246 GIC 105661 : LruInsert(File file)
1247 : {
1248 : Vfd *vfdP;
1249 :
8736 tgl 1250 105661 : Assert(file != 0);
1251 :
7708 bruce 1252 ECB : DO_DB(elog(LOG, "LruInsert %d (%s)",
1253 : file, VfdCache[file].fileName));
9345 1254 :
9345 bruce 1255 GIC 105661 : vfdP = &VfdCache[file];
1256 :
1257 105661 : if (FileIsNotOpen(file))
1258 : {
1259 : /* Close excess kernel FDs. */
3591 tgl 1260 105661 : ReleaseLruFiles();
9345 bruce 1261 ECB :
1262 : /*
1263 : * The open could still fail for lack of file descriptors, eg due to
6385 1264 : * overall system file table being full. So, be prepared to release
1265 : * another FD if necessary...
1266 : */
2024 peter_e 1267 GIC 105661 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1268 : vfdP->fileMode);
9345 bruce 1269 105661 : if (vfdP->fd < 0)
9345 bruce 1270 ECB : {
2238 tgl 1271 : DO_DB(elog(LOG, "re-open failed: %m"));
3615 tgl 1272 UBC 0 : return -1;
1273 : }
1274 : else
9345 bruce 1275 ECB : {
9345 bruce 1276 GIC 105661 : ++nfile;
1277 : }
1278 : }
1279 :
9345 bruce 1280 ECB : /*
1281 : * put it at the head of the Lru ring
1282 : */
1283 :
9345 bruce 1284 CBC 105661 : Insert(file);
1285 :
8986 1286 105661 : return 0;
9770 scrappy 1287 ECB : }
1288 :
1289 : /*
1290 : * Release one kernel FD by closing the least-recently-used VFD.
1291 : */
8260 tgl 1292 : static bool
8260 tgl 1293 GIC 264791 : ReleaseLruFile(void)
1294 : {
7708 bruce 1295 ECB : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1296 :
8260 tgl 1297 CBC 264791 : if (nfile > 0)
1298 : {
8260 tgl 1299 ECB : /*
1300 : * There are opened files and so there should be at least one used vfd
1301 : * in the ring.
1302 : */
8260 tgl 1303 CBC 264791 : Assert(VfdCache[0].lruMoreRecently != 0);
8260 tgl 1304 GIC 264791 : LruDelete(VfdCache[0].lruMoreRecently);
8260 tgl 1305 CBC 264791 : return true; /* freed a file */
1306 : }
8260 tgl 1307 UIC 0 : return false; /* no files available to free */
1308 : }
1309 :
3591 tgl 1310 ECB : /*
1311 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1312 : * After calling this, it's OK to try to open another file.
1313 : */
1314 : static void
3591 tgl 1315 CBC 2733022 : ReleaseLruFiles(void)
1316 : {
1140 1317 2997813 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
3591 tgl 1318 ECB : {
3591 tgl 1319 CBC 264791 : if (!ReleaseLruFile())
3591 tgl 1320 UIC 0 : break;
1321 : }
3591 tgl 1322 GIC 2733022 : }
3591 tgl 1323 ECB :
1324 : static File
8260 tgl 1325 GIC 1746847 : AllocateVfd(void)
1326 : {
1327 : Index i;
1328 : File file;
1329 :
1330 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1331 :
6385 bruce 1332 1746847 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1333 :
9345 1334 1746847 : if (VfdCache[0].nextFree == 0)
9345 bruce 1335 ECB : {
1336 : /*
6385 1337 : * The free list is empty so it is time to increase the size of the
1338 : * array. We choose to double it each time this happens. However,
6385 bruce 1339 EUB : * there's not much point in starting *real* small.
1340 : */
8720 bruce 1341 CBC 17078 : Size newCacheSize = SizeVfdCache * 2;
1342 : Vfd *newVfdCache;
1343 :
8736 tgl 1344 GIC 17078 : if (newCacheSize < 32)
1345 11791 : newCacheSize = 32;
1346 :
1347 : /*
7199 tgl 1348 ECB : * Be careful not to clobber VfdCache ptr if realloc fails.
8041 1349 : */
8041 tgl 1350 GIC 17078 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1351 17078 : if (newVfdCache == NULL)
7199 tgl 1352 LBC 0 : ereport(ERROR,
1353 : (errcode(ERRCODE_OUT_OF_MEMORY),
1354 : errmsg("out of memory")));
8041 tgl 1355 GIC 17078 : VfdCache = newVfdCache;
1356 :
1357 : /*
1358 : * Initialize the new entries and link them into the free list.
9345 bruce 1359 ECB : */
8736 tgl 1360 GIC 811335 : for (i = SizeVfdCache; i < newCacheSize; i++)
9345 bruce 1361 ECB : {
8736 tgl 1362 GIC 6354056 : MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
9345 bruce 1363 CBC 794257 : VfdCache[i].nextFree = i + 1;
9345 bruce 1364 GIC 794257 : VfdCache[i].fd = VFD_CLOSED;
9345 bruce 1365 ECB : }
8736 tgl 1366 CBC 17078 : VfdCache[newCacheSize - 1].nextFree = 0;
9345 bruce 1367 GIC 17078 : VfdCache[0].nextFree = SizeVfdCache;
1368 :
1369 : /*
9345 bruce 1370 ECB : * Record the new size
1371 : */
8736 tgl 1372 GIC 17078 : SizeVfdCache = newCacheSize;
1373 : }
1374 :
9345 bruce 1375 1746847 : file = VfdCache[0].nextFree;
1376 :
1377 1746847 : VfdCache[0].nextFree = VfdCache[file].nextFree;
9345 bruce 1378 ECB :
9345 bruce 1379 GIC 1746847 : return file;
9770 scrappy 1380 ECB : }
1381 :
1382 : static void
9770 scrappy 1383 GIC 1453862 : FreeVfd(File file)
9770 scrappy 1384 ECB : {
8736 tgl 1385 CBC 1453862 : Vfd *vfdP = &VfdCache[file];
8736 tgl 1386 ECB :
1387 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1388 : file, vfdP->fileName ? vfdP->fileName : ""));
1389 :
8736 tgl 1390 GIC 1453862 : if (vfdP->fileName != NULL)
1391 : {
1392 978713 : free(vfdP->fileName);
1393 978713 : vfdP->fileName = NULL;
1394 : }
8041 1395 1453862 : vfdP->fdstate = 0x0;
1396 :
8736 1397 1453862 : vfdP->nextFree = VfdCache[0].nextFree;
9345 bruce 1398 1453862 : VfdCache[0].nextFree = file;
9770 scrappy 1399 1453862 : }
1400 :
1401 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1402 : static int
1403 3545355 : FileAccess(File file)
1404 : {
1405 : int returnValue;
9345 bruce 1406 ECB :
1407 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1408 : file, VfdCache[file].fileName));
1409 :
1410 : /*
1411 : * Is the file open? If not, open it and put it at the head of the LRU
1412 : * ring (possibly closing the least recently used file to get an FD).
1413 : */
1414 :
9345 bruce 1415 GIC 3545355 : if (FileIsNotOpen(file))
1416 : {
1417 105661 : returnValue = LruInsert(file);
1418 105661 : if (returnValue != 0)
9345 bruce 1419 LBC 0 : return returnValue;
1420 : }
8736 tgl 1421 GIC 3439694 : else if (VfdCache[0].lruLessRecently != file)
1422 : {
1423 : /*
1424 : * We now know that the file is open and that it is not the last one
1425 : * accessed, so we need to move it to the head of the Lru ring.
1426 : */
1427 :
9345 bruce 1428 906373 : Delete(file);
1429 906373 : Insert(file);
1430 : }
9345 bruce 1431 ECB :
8986 bruce 1432 CBC 3545355 : return 0;
9770 scrappy 1433 EUB : }
1434 :
1435 : /*
1436 : * Called whenever a temporary file is deleted to report its size.
1955 andres 1437 ECB : */
1438 : static void
1955 andres 1439 GIC 3222 : ReportTemporaryFileUsage(const char *path, off_t size)
1440 : {
1955 andres 1441 CBC 3222 : pgstat_report_tempfile(size);
1442 :
1955 andres 1443 GIC 3222 : if (log_temp_files >= 0)
1444 : {
1445 874 : if ((size / 1024) >= log_temp_files)
1446 135 : ereport(LOG,
1447 : (errmsg("temporary file: path \"%s\", size %lu",
1448 : path, (unsigned long) size)));
1955 andres 1449 ECB : }
1955 andres 1450 GIC 3222 : }
1955 andres 1451 ECB :
1452 : /*
1453 : * Called to register a temporary file for automatic close.
1454 : * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1455 : * before the file was opened.
1456 : */
1457 : static void
1955 andres 1458 CBC 5110 : RegisterTemporaryFile(File file)
1955 andres 1459 ECB : {
1955 andres 1460 CBC 5110 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1955 andres 1461 GIC 5110 : VfdCache[file].resowner = CurrentResourceOwner;
1955 andres 1462 ECB :
1463 : /* Backup mechanism for closing at end of xact. */
1955 andres 1464 GIC 5110 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1465 5110 : have_xact_temporary_files = true;
1955 andres 1466 CBC 5110 : }
1467 :
9770 scrappy 1468 ECB : /*
9345 bruce 1469 : * Called when we get a shared invalidation message on some relation.
9770 scrappy 1470 : */
9364 bruce 1471 : #ifdef NOT_USED
9770 scrappy 1472 : void
1473 : FileInvalidate(File file)
1474 : {
1475 : Assert(FileIsValid(file));
9345 bruce 1476 : if (!FileIsNotOpen(file))
1477 : LruDelete(file);
1478 : }
1479 : #endif
1480 :
1481 : /*
1482 : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1483 : * fileMode parameter.
1484 : */
1485 : File
2024 peter_e 1486 GIC 1746847 : PathNameOpenFile(const char *fileName, int fileFlags)
1487 : {
1828 sfrost 1488 1746847 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1489 : }
1490 :
6488 tgl 1491 ECB : /*
1492 : * open a file in an arbitrary directory
1493 : *
1494 : * NB: if the passed pathname is relative (which it usually is),
1495 : * it will be interpreted relative to the process' working directory
1496 : * (which should always be $PGDATA when this code is running).
1497 : */
1498 : File
2024 peter_e 1499 GIC 1746847 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1500 : {
1501 : char *fnamecopy;
1502 : File file;
9344 bruce 1503 ECB : Vfd *vfdP;
9345 bruce 1504 EUB :
1505 : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1506 : fileName, fileFlags, fileMode));
1507 :
1508 : /*
1509 : * We need a malloc'd copy of the file name; fail cleanly if no room.
7199 tgl 1510 ECB : */
7199 tgl 1511 GBC 1746847 : fnamecopy = strdup(fileName);
7199 tgl 1512 GIC 1746847 : if (fnamecopy == NULL)
7199 tgl 1513 UIC 0 : ereport(ERROR,
1514 : (errcode(ERRCODE_OUT_OF_MEMORY),
1515 : errmsg("out of memory")));
1516 :
9345 bruce 1517 GIC 1746847 : file = AllocateVfd();
1518 1746847 : vfdP = &VfdCache[file];
1519 :
1520 : /* Close excess kernel FDs. */
3591 tgl 1521 1746847 : ReleaseLruFiles();
9345 bruce 1522 ECB :
1523 : /*
1524 : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1525 : * client shouldn't be expected to know which kernel descriptors are
1526 : * currently open, so it wouldn't make sense for them to be inherited by
1527 : * executed subprograms.
1528 : */
37 tmunro 1529 GNC 1746847 : fileFlags |= O_CLOEXEC;
1530 :
2024 peter_e 1531 GIC 1746847 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1532 :
9345 bruce 1533 1746847 : if (vfdP->fd < 0)
1534 : {
3615 tgl 1535 CBC 475149 : int save_errno = errno;
3615 tgl 1536 ECB :
9345 bruce 1537 GIC 475149 : FreeVfd(file);
7199 tgl 1538 475149 : free(fnamecopy);
3615 1539 475149 : errno = save_errno;
9345 bruce 1540 475149 : return -1;
1541 : }
1542 1271698 : ++nfile;
1543 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1544 : vfdP->fd));
9345 bruce 1545 ECB :
7199 tgl 1546 GIC 1271698 : vfdP->fileName = fnamecopy;
1547 : /* Saved flags are adjusted to be OK for re-opening file */
8041 1548 1271698 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
9345 bruce 1549 1271698 : vfdP->fileMode = fileMode;
4284 tgl 1550 1271698 : vfdP->fileSize = 0;
7551 1551 1271698 : vfdP->fdstate = 0x0;
4875 heikki.linnakangas 1552 1271698 : vfdP->resowner = NULL;
1553 :
874 tgl 1554 1271698 : Insert(file);
1555 :
9345 bruce 1556 1271698 : return file;
1557 : }
1558 :
1559 : /*
1560 : * Create directory 'directory'. If necessary, create 'basedir', which must
1561 : * be the directory above it. This is designed for creating the top-level
1562 : * temporary directory on demand before creating a directory underneath it.
1955 andres 1563 ECB : * Do nothing if the directory already exists.
1564 : *
1565 : * Directories created within the top-level temporary directory should begin
1566 : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1567 : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1568 : * that do not need any particular prefix.
1569 : */
1570 : void
1955 andres 1571 GIC 171 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1572 : {
1828 sfrost 1573 CBC 171 : if (MakePGDirectory(directory) < 0)
1955 andres 1574 ECB : {
1955 andres 1575 GIC 18 : if (errno == EEXIST)
1576 9 : return;
1577 :
1578 : /*
1579 : * Failed. Try to create basedir first in case it's missing. Tolerate
1580 : * EEXIST to close a race against another process following the same
1581 : * algorithm.
1582 : */
1828 sfrost 1583 9 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1955 andres 1584 UIC 0 : ereport(ERROR,
1955 andres 1585 ECB : (errcode_for_file_access(),
1586 : errmsg("cannot create temporary directory \"%s\": %m",
1587 : basedir)));
1588 :
1589 : /* Try again. */
1828 sfrost 1590 CBC 9 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1955 andres 1591 UIC 0 : ereport(ERROR,
1592 : (errcode_for_file_access(),
1593 : errmsg("cannot create temporary subdirectory \"%s\": %m",
1594 : directory)));
1595 : }
1596 : }
1597 :
1955 andres 1598 ECB : /*
1599 : * Delete a directory and everything in it, if it exists.
1600 : */
1601 : void
1955 andres 1602 GIC 198 : PathNameDeleteTemporaryDir(const char *dirname)
1603 : {
1604 : struct stat statbuf;
1955 andres 1605 ECB :
1606 : /* Silently ignore missing directory. */
1955 andres 1607 GIC 198 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1955 andres 1608 CBC 36 : return;
1955 andres 1609 ECB :
1610 : /*
1611 : * Currently, walkdir doesn't offer a way for our passed in function to
1612 : * maintain state. Perhaps it should, so that we could tell the caller
1613 : * whether this operation succeeded or failed. Since this operation is
1614 : * used in a cleanup path, we wouldn't actually behave differently: we'll
1615 : * just log failures.
1616 : */
1955 andres 1617 GIC 162 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1955 andres 1618 ECB : }
1619 :
1620 : /*
1621 : * Open a temporary file that will disappear when we close it.
1622 : *
1623 : * This routine takes care of generating an appropriate tempfile name.
1624 : * There's no need to pass in fileFlags or fileMode either, since only
8736 tgl 1625 : * one setting makes any sense for a temp file.
7318 bruce 1626 : *
1627 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
4875 heikki.linnakangas 1628 : * to ensure it's closed and deleted when it's no longer needed, typically at
1629 : * the end-of-transaction. In most cases, you don't want temporary files to
1630 : * outlive the transaction that created them, so this should be false -- but
1631 : * if you need "somewhat" temporary storage, this might be useful. In either
1632 : * case, the file is removed when the File is explicitly closed.
1633 : */
1634 : File
5785 tgl 1635 GIC 1783 : OpenTemporaryFile(bool interXact)
8736 tgl 1636 ECB : {
5789 tgl 1637 GIC 1783 : File file = 0;
1638 :
610 andres 1639 1783 : Assert(temporary_files_allowed); /* check temp file access is up */
1640 :
1641 : /*
1642 : * Make sure the current resource owner has space for this File before we
1978 tgl 1643 ECB : * open it, if we'll be registering it below.
1644 : */
1978 tgl 1645 GIC 1783 : if (!interXact)
1646 1777 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1647 :
1648 : /*
5785 tgl 1649 ECB : * If some temp tablespace(s) have been given to us, try to use the next
1650 : * one. If a given tablespace can't be found, we silently fall back to
1651 : * the database's default tablespace.
1652 : *
1653 : * BUT: if the temp file is slated to outlive the current transaction,
1654 : * force it into the database's default tablespace, so that it will not
5624 bruce 1655 : * pose a threat to possible tablespace drop attempts.
1656 : */
5785 tgl 1657 GIC 1783 : if (numTempTableSpaces > 0 && !interXact)
1658 : {
5624 bruce 1659 1 : Oid tblspcOid = GetNextTempTableSpace();
1660 :
5785 tgl 1661 1 : if (OidIsValid(tblspcOid))
5785 tgl 1662 CBC 1 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1663 : }
5789 tgl 1664 ECB :
1665 : /*
1666 : * If not, or if tablespace is bad, create in database's default
1667 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1668 : * here, but just in case it isn't, fall back to pg_default tablespace.
1669 : */
5789 tgl 1670 GIC 1783 : if (file <= 0)
1671 1782 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1672 : MyDatabaseTableSpace :
1673 : DEFAULTTABLESPACE_OID,
5789 tgl 1674 ECB : true);
1675 :
1955 andres 1676 : /* Mark it for deletion at close and temporary file size limit */
1955 andres 1677 GIC 1783 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
5789 tgl 1678 ECB :
4875 heikki.linnakangas 1679 EUB : /* Register it with the current resource owner */
5789 tgl 1680 GIC 1783 : if (!interXact)
1955 andres 1681 1777 : RegisterTemporaryFile(file);
1682 :
5789 tgl 1683 CBC 1783 : return file;
1684 : }
1685 :
1686 : /*
1687 : * Return the path of the temp directory in a given tablespace.
1688 : */
1689 : void
1955 andres 1690 GIC 9649 : TempTablespacePath(char *path, Oid tablespace)
1691 : {
1692 : /*
1693 : * Identify the tempfile directory for this tablespace.
1694 : *
1695 : * If someone tries to specify pg_global, use pg_default instead.
1696 : */
1697 9649 : if (tablespace == InvalidOid ||
1698 1 : tablespace == DEFAULTTABLESPACE_OID ||
1699 : tablespace == GLOBALTABLESPACE_OID)
1955 andres 1700 CBC 9648 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1701 : else
1702 : {
1703 : /* All other tablespaces are accessed via symlinks */
1704 1 : snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1705 : tablespace, TABLESPACE_VERSION_DIRECTORY,
1955 andres 1706 ECB : PG_TEMP_FILES_DIR);
1707 : }
1955 andres 1708 GIC 9649 : }
1709 :
1710 : /*
1711 : * Open a temporary file in a specific tablespace.
1955 andres 1712 ECB : * Subroutine for OpenTemporaryFile, which see for details.
1713 : */
1714 : static File
1955 andres 1715 CBC 1783 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1955 andres 1716 EUB : {
1717 : char tempdirpath[MAXPGPATH];
1718 : char tempfilepath[MAXPGPATH];
1719 : File file;
1720 :
1955 andres 1721 CBC 1783 : TempTablespacePath(tempdirpath, tblspcOid);
1722 :
1723 : /*
1724 : * Generate a tempfile name that should be unique within the current
5878 bruce 1725 ECB : * database instance.
1726 : */
5789 tgl 1727 GIC 1783 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
5789 tgl 1728 ECB : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1729 :
7972 1730 : /*
1731 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1732 : * temp file that can be reused.
1733 : */
5789 tgl 1734 GIC 1783 : file = PathNameOpenFile(tempfilepath,
1735 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
8736 1736 1783 : if (file <= 0)
1737 : {
1738 : /*
1739 : * We might need to create the tablespace's tempfile directory, if no
5624 bruce 1740 ECB : * one has yet done so.
1741 : *
1742 : * Don't check for an error from MakePGDirectory; it could fail if
1743 : * someone else just did the same thing. If it doesn't work then
1828 sfrost 1744 : * we'll bomb out on the second create attempt, instead.
1745 : */
1828 sfrost 1746 CBC 71 : (void) MakePGDirectory(tempdirpath);
1747 :
5789 tgl 1748 71 : file = PathNameOpenFile(tempfilepath,
1749 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
5789 tgl 1750 GIC 71 : if (file <= 0 && rejectError)
7199 tgl 1751 LBC 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
7285 tgl 1752 EUB : tempfilepath);
1753 : }
1754 :
8736 tgl 1755 GIC 1783 : return file;
1756 : }
8736 tgl 1757 ECB :
1758 :
1759 : /*
1955 andres 1760 : * Create a new file. The directory containing it must already exist. Files
1761 : * created this way are subject to temp_file_limit and are automatically
1762 : * closed at end of transaction, but are not automatically deleted on close
1763 : * because they are intended to be shared between cooperating backends.
1764 : *
1765 : * If the file is inside the top-level temporary directory, its name should
1766 : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1767 : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1768 : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1769 : * the prefix isn't needed.
1770 : */
1771 : File
1955 andres 1772 GIC 1610 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1773 : {
1774 : File file;
1775 :
610 1776 1610 : Assert(temporary_files_allowed); /* check temp file access is up */
610 andres 1777 ECB :
1955 andres 1778 CBC 1610 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1779 :
1955 andres 1780 ECB : /*
1781 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1782 : * temp file that can be reused.
1783 : */
1955 andres 1784 GIC 1610 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1785 1610 : if (file <= 0)
1786 : {
1955 andres 1787 CBC 171 : if (error_on_failure)
1955 andres 1788 LBC 0 : ereport(ERROR,
1789 : (errcode_for_file_access(),
1955 andres 1790 ECB : errmsg("could not create temporary file \"%s\": %m",
1791 : path)));
1955 andres 1792 EUB : else
1955 andres 1793 GBC 171 : return file;
1794 : }
1795 :
1796 : /* Mark it for temp_file_limit accounting. */
1797 1439 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1798 :
1799 : /* Register it for automatic close. */
1955 andres 1800 CBC 1439 : RegisterTemporaryFile(file);
1955 andres 1801 ECB :
1955 andres 1802 GIC 1439 : return file;
1803 : }
1955 andres 1804 EUB :
1805 : /*
1806 : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1807 : * another backend. Files opened this way don't count against the
1808 : * temp_file_limit of the caller, are automatically closed at the end of the
1809 : * transaction but are not deleted on close.
1955 andres 1810 ECB : */
1811 : File
956 akapila 1812 GIC 4059 : PathNameOpenTemporaryFile(const char *path, int mode)
1813 : {
1814 : File file;
1815 :
610 andres 1816 4059 : Assert(temporary_files_allowed); /* check temp file access is up */
610 andres 1817 ECB :
1955 andres 1818 GIC 4059 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1819 :
956 akapila 1820 4059 : file = PathNameOpenFile(path, mode | PG_BINARY);
1955 andres 1821 ECB :
1822 : /* If no such file, then we don't raise an error. */
1955 andres 1823 GIC 4059 : if (file <= 0 && errno != ENOENT)
1955 andres 1824 UIC 0 : ereport(ERROR,
1825 : (errcode_for_file_access(),
1955 andres 1826 ECB : errmsg("could not open temporary file \"%s\": %m",
1827 : path)));
1828 :
1955 andres 1829 GIC 4059 : if (file > 0)
1830 : {
1955 andres 1831 ECB : /* Register it for automatic close. */
1955 andres 1832 GIC 1894 : RegisterTemporaryFile(file);
1833 : }
1834 :
1835 4059 : return file;
1836 : }
1955 andres 1837 EUB :
1838 : /*
1839 : * Delete a file by pathname. Return true if the file existed, false if
1840 : * didn't.
1955 andres 1841 ECB : */
1842 : bool
1955 andres 1843 GIC 3228 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1844 : {
1955 andres 1845 ECB : struct stat filestats;
1846 : int stat_errno;
1847 :
1848 : /* Get the final size for pgstat reporting. */
1955 andres 1849 GIC 3228 : if (stat(path, &filestats) != 0)
1850 1789 : stat_errno = errno;
1955 andres 1851 ECB : else
1955 andres 1852 CBC 1439 : stat_errno = 0;
1853 :
1854 : /*
1855 : * Unlike FileClose's automatic file deletion code, we tolerate
1856 : * non-existence to support BufFileDeleteFileSet which doesn't know how
1857 : * many segments it has to delete until it runs out.
1955 andres 1858 ECB : */
1955 andres 1859 GIC 3228 : if (stat_errno == ENOENT)
1860 1789 : return false;
1861 :
1862 1439 : if (unlink(path) < 0)
1863 : {
1955 andres 1864 UIC 0 : if (errno != ENOENT)
1865 0 : ereport(error_on_failure ? ERROR : LOG,
1866 : (errcode_for_file_access(),
1867 : errmsg("could not unlink temporary file \"%s\": %m",
1868 : path)));
1869 0 : return false;
1955 andres 1870 ECB : }
1871 :
1955 andres 1872 GIC 1439 : if (stat_errno == 0)
1873 1439 : ReportTemporaryFileUsage(path, filestats.st_size);
1955 andres 1874 ECB : else
1955 andres 1875 EUB : {
1955 andres 1876 UIC 0 : errno = stat_errno;
1955 andres 1877 LBC 0 : ereport(LOG,
1878 : (errcode_for_file_access(),
1879 : errmsg("could not stat file \"%s\": %m", path)));
1955 andres 1880 ECB : }
1955 andres 1881 EUB :
1955 andres 1882 GIC 1439 : return true;
1883 : }
1884 :
1885 : /*
8736 tgl 1886 ECB : * close a file when done with it
1887 : */
1888 : void
9770 scrappy 1889 GIC 978713 : FileClose(File file)
9770 scrappy 1890 EUB : {
5624 bruce 1891 : Vfd *vfdP;
1892 :
8736 tgl 1893 GIC 978713 : Assert(FileIsValid(file));
1894 :
1895 : DO_DB(elog(LOG, "FileClose: %d (%s)",
1896 : file, VfdCache[file].fileName));
1897 :
7728 tgl 1898 CBC 978713 : vfdP = &VfdCache[file];
7728 tgl 1899 ECB :
9345 bruce 1900 GIC 978713 : if (!FileIsNotOpen(file))
1901 : {
1902 : /* close the file */
1373 peter 1903 883076 : if (close(vfdP->fd) != 0)
1602 tmunro 1904 ECB : {
1905 : /*
1906 : * We may need to panic on failure to close non-temporary files;
1907 : * see LruDelete.
1908 : */
1602 tmunro 1909 UIC 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1910 : "could not close file \"%s\": %m", vfdP->fileName);
1911 : }
1912 :
9345 bruce 1913 GIC 883076 : --nfile;
7728 tgl 1914 883076 : vfdP->fd = VFD_CLOSED;
1915 :
1916 : /* remove the file from the lru ring */
2238 tgl 1917 CBC 883076 : Delete(file);
1918 : }
1919 :
1955 andres 1920 GIC 978713 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1921 : {
1955 andres 1922 ECB : /* Subtract its size from current usage (do first in case of error) */
1955 andres 1923 GIC 3222 : temporary_files_size -= vfdP->fileSize;
1924 3222 : vfdP->fileSize = 0;
1925 : }
1926 :
1927 : /*
4535 tgl 1928 ECB : * Delete the file if it was temporary, and make a log entry if wanted
9345 bruce 1929 : */
1955 andres 1930 GBC 978713 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1931 : {
4091 magnus 1932 ECB : struct stat filestats;
1933 : int stat_errno;
1934 :
4535 tgl 1935 : /*
1936 : * If we get an error, as could happen within the ereport/elog calls,
1937 : * we'll come right back here during transaction abort. Reset the
1938 : * flag to ensure that we can't get into an infinite loop. This code
1939 : * is arranged to ensure that the worst-case consequence is failing to
1940 : * emit log message(s), not failing to attempt the unlink.
1941 : */
1955 andres 1942 GIC 1783 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1943 :
1944 :
4091 magnus 1945 ECB : /* first try the stat() */
4091 magnus 1946 GIC 1783 : if (stat(vfdP->fileName, &filestats))
4091 magnus 1947 UIC 0 : stat_errno = errno;
1948 : else
4091 magnus 1949 CBC 1783 : stat_errno = 0;
1950 :
1951 : /* in any case do the unlink */
4091 magnus 1952 GIC 1783 : if (unlink(vfdP->fileName))
856 peter 1953 UIC 0 : ereport(LOG,
1954 : (errcode_for_file_access(),
856 peter 1955 ECB : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
4535 tgl 1956 EUB :
1957 : /* and last report the stat results */
4091 magnus 1958 CBC 1783 : if (stat_errno == 0)
1955 andres 1959 GBC 1783 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1960 : else
4089 magnus 1961 ECB : {
4089 magnus 1962 LBC 0 : errno = stat_errno;
856 peter 1963 UBC 0 : ereport(LOG,
1964 : (errcode_for_file_access(),
856 peter 1965 ECB : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
4535 tgl 1966 : }
8122 1967 : }
1968 :
1969 : /* Unregister it from the resource owner */
4875 heikki.linnakangas 1970 GIC 978713 : if (vfdP->resowner)
4875 heikki.linnakangas 1971 CBC 5110 : ResourceOwnerForgetFile(vfdP->resowner, file);
1972 :
1973 : /*
1974 : * Return the Vfd slot to the free list
1975 : */
8736 tgl 1976 GIC 978713 : FreeVfd(file);
9770 scrappy 1977 CBC 978713 : }
1978 :
1979 : /*
1980 : * FilePrefetch - initiate asynchronous read of a given range of the file.
1981 : *
1982 : * Currently the only implementation of this function is using posix_fadvise
1983 : * which is the simplest standardized interface that accomplishes this.
5200 tgl 1984 ECB : * We could add an implementation using libaio in the future; but note that
1985 : * this API is inappropriate for libaio, which wants to have a buffer provided
5200 tgl 1986 EUB : * to read into.
1987 : */
5200 tgl 1988 ECB : int
122 peter 1989 GNC 212103 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
5200 tgl 1990 ECB : {
1991 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1992 : int returnCode;
1993 :
5200 tgl 1994 GIC 212103 : Assert(FileIsValid(file));
5050 bruce 1995 ECB :
1996 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1997 : file, VfdCache[file].fileName,
1998 : (int64) offset, (int64) amount));
1999 :
5200 tgl 2000 GIC 212103 : returnCode = FileAccess(file);
2001 212103 : if (returnCode < 0)
5200 tgl 2002 UIC 0 : return returnCode;
2003 :
2213 rhaas 2004 GIC 212103 : pgstat_report_wait_start(wait_event_info);
5200 tgl 2005 212103 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2006 : POSIX_FADV_WILLNEED);
2213 rhaas 2007 212103 : pgstat_report_wait_end();
2008 :
5200 tgl 2009 212103 : return returnCode;
2010 : #else
2011 : Assert(FileIsValid(file));
2012 : return 0;
2013 : #endif
2014 : }
2015 :
2016 : void
2213 rhaas 2017 134022 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2018 : {
2606 andres 2019 EUB : int returnCode;
2020 :
2606 andres 2021 GIC 134022 : Assert(FileIsValid(file));
2022 :
2552 tgl 2023 ECB : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2024 : file, VfdCache[file].fileName,
2025 : (int64) offset, (int64) nbytes));
2026 :
2552 tgl 2027 CBC 134022 : if (nbytes <= 0)
2552 tgl 2028 UIC 0 : return;
2029 :
1 tmunro 2030 GNC 134022 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
1 tmunro 2031 UNC 0 : return;
2032 :
2606 andres 2033 GIC 134022 : returnCode = FileAccess(file);
2034 134022 : if (returnCode < 0)
2606 andres 2035 UIC 0 : return;
2606 andres 2036 ECB :
2213 rhaas 2037 GIC 134022 : pgstat_report_wait_start(wait_event_info);
2552 tgl 2038 134022 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
2213 rhaas 2039 134022 : pgstat_report_wait_end();
2040 : }
2041 :
2042 : int
122 peter 2043 GNC 1721666 : FileRead(File file, void *buffer, size_t amount, off_t offset,
1614 tmunro 2044 ECB : uint32 wait_event_info)
9770 scrappy 2045 EUB : {
2046 : int returnCode;
2238 tgl 2047 ECB : Vfd *vfdP;
2048 :
8736 tgl 2049 GIC 1721666 : Assert(FileIsValid(file));
2050 :
2051 : DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
2052 : file, VfdCache[file].fileName,
2053 : (int64) offset,
2054 : amount, buffer));
2055 :
6887 2056 1721666 : returnCode = FileAccess(file);
6887 tgl 2057 CBC 1721666 : if (returnCode < 0)
6887 tgl 2058 UIC 0 : return returnCode;
6887 tgl 2059 EUB :
2238 tgl 2060 GIC 1721666 : vfdP = &VfdCache[file];
2238 tgl 2061 EUB :
6338 tgl 2062 GIC 1721666 : retry:
2213 rhaas 2063 GBC 1721666 : pgstat_report_wait_start(wait_event_info);
192 tmunro 2064 GIC 1721666 : returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2213 rhaas 2065 GBC 1721666 : pgstat_report_wait_end();
6338 tgl 2066 EUB :
1614 tmunro 2067 GBC 1721666 : if (returnCode < 0)
2068 : {
2069 : /*
2070 : * Windows may run out of kernel buffers and return "Insufficient
2071 : * system resources" error. Wait a bit and retry to solve it.
2072 : *
2073 : * It is rumored that EINTR is also possible on some Unix filesystems,
6338 tgl 2074 ECB : * in which case immediate retry is indicated.
2075 : */
2076 : #ifdef WIN32
6031 bruce 2077 : DWORD error = GetLastError();
6338 tgl 2078 :
2079 : switch (error)
2080 : {
2081 : case ERROR_NO_SYSTEM_RESOURCES:
6338 tgl 2082 EUB : pg_usleep(1000L);
2083 : errno = EINTR;
6338 tgl 2084 ECB : break;
2085 : default:
2086 : _dosmaperr(error);
2087 : break;
2088 : }
2089 : #endif
2090 : /* OK to retry if interrupted */
6338 tgl 2091 LBC 0 : if (errno == EINTR)
6338 tgl 2092 UIC 0 : goto retry;
6338 tgl 2093 ECB : }
2094 :
9345 bruce 2095 CBC 1721666 : return returnCode;
9770 scrappy 2096 ECB : }
2097 :
2098 : int
122 peter 2099 GNC 993512 : FileWrite(File file, const void *buffer, size_t amount, off_t offset,
2100 : uint32 wait_event_info)
2101 : {
2102 : int returnCode;
2103 : Vfd *vfdP;
2104 :
8736 tgl 2105 GIC 993512 : Assert(FileIsValid(file));
2106 :
2107 : DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p",
2108 : file, VfdCache[file].fileName,
2109 : (int64) offset,
2110 : amount, buffer));
2111 :
6887 2112 993512 : returnCode = FileAccess(file);
2113 993512 : if (returnCode < 0)
6887 tgl 2114 UIC 0 : return returnCode;
2115 :
2238 tgl 2116 GIC 993512 : vfdP = &VfdCache[file];
2117 :
2118 : /*
2119 : * If enforcing temp_file_limit and it's a temp file, check to see if the
3260 bruce 2120 EUB : * write would overrun temp_file_limit, and throw error if so. Note: it's
4284 tgl 2121 : * really a modularity violation to throw error here; we should set errno
2122 : * and return -1. However, there's no way to report a suitable error
2123 : * message if we do that. All current callers would just throw error
4284 tgl 2124 ECB : * immediately anyway, so this is safe at present.
2125 : */
1955 andres 2126 GIC 993512 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2127 : {
1614 tmunro 2128 LBC 0 : off_t past_write = offset + amount;
2129 :
1614 tmunro 2130 UIC 0 : if (past_write > vfdP->fileSize)
2131 : {
3955 bruce 2132 LBC 0 : uint64 newTotal = temporary_files_size;
2133 :
1614 tmunro 2134 UIC 0 : newTotal += past_write - vfdP->fileSize;
4284 tgl 2135 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2136 0 : ereport(ERROR,
4284 tgl 2137 ECB : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2118 2138 : errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2118 tgl 2139 EUB : temp_file_limit)));
2140 : }
4284 tgl 2141 ECB : }
2142 :
6338 tgl 2143 CBC 993512 : retry:
7977 tgl 2144 GIC 993512 : errno = 0;
2213 rhaas 2145 CBC 993512 : pgstat_report_wait_start(wait_event_info);
192 tmunro 2146 GIC 993512 : returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2213 rhaas 2147 993512 : pgstat_report_wait_end();
2148 :
2149 : /* if write didn't set errno, assume problem is no disk space */
7977 tgl 2150 993512 : if (returnCode != amount && errno == 0)
7977 tgl 2151 UIC 0 : errno = ENOSPC;
2152 :
6338 tgl 2153 GIC 993512 : if (returnCode >= 0)
2154 : {
2238 tgl 2155 ECB : /*
2156 : * Maintain fileSize and temporary_files_size if it's a temp file.
2157 : */
1955 andres 2158 GIC 993512 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2159 : {
1614 tmunro 2160 CBC 58193 : off_t past_write = offset + amount;
2161 :
1614 tmunro 2162 GIC 58193 : if (past_write > vfdP->fileSize)
2163 : {
2164 39327 : temporary_files_size += past_write - vfdP->fileSize;
2165 39327 : vfdP->fileSize = past_write;
4284 tgl 2166 ECB : }
2167 : }
4284 tgl 2168 EUB : }
2169 : else
6338 tgl 2170 ECB : {
2171 : /*
2172 : * See comments in FileRead()
2173 : */
2174 : #ifdef WIN32
6031 bruce 2175 EUB : DWORD error = GetLastError();
6338 tgl 2176 ECB :
2177 : switch (error)
2178 : {
6338 tgl 2179 EUB : case ERROR_NO_SYSTEM_RESOURCES:
2180 : pg_usleep(1000L);
2181 : errno = EINTR;
2182 : break;
2183 : default:
6338 tgl 2184 ECB : _dosmaperr(error);
2185 : break;
2186 : }
2187 : #endif
2188 : /* OK to retry if interrupted */
6338 tgl 2189 UIC 0 : if (errno == EINTR)
2190 0 : goto retry;
2191 : }
2192 :
9345 bruce 2193 GIC 993512 : return returnCode;
2194 : }
2195 :
2196 : int
2213 rhaas 2197 59883 : FileSync(File file, uint32 wait_event_info)
2198 : {
2199 : int returnCode;
6887 tgl 2200 ECB :
6887 tgl 2201 GIC 59883 : Assert(FileIsValid(file));
2202 :
2203 : DO_DB(elog(LOG, "FileSync: %d (%s)",
2204 : file, VfdCache[file].fileName));
6887 tgl 2205 ECB :
6887 tgl 2206 GIC 59883 : returnCode = FileAccess(file);
2207 59883 : if (returnCode < 0)
6887 tgl 2208 UIC 0 : return returnCode;
2209 :
2213 rhaas 2210 GIC 59883 : pgstat_report_wait_start(wait_event_info);
2213 rhaas 2211 CBC 59883 : returnCode = pg_fsync(VfdCache[file].fd);
2212 59883 : pgstat_report_wait_end();
2213 rhaas 2213 EUB :
2213 rhaas 2214 GIC 59883 : return returnCode;
6887 tgl 2215 ECB : }
2216 :
2217 : /*
2218 : * Zero a region of the file.
2219 : *
2220 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2221 : * appropriate error.
2222 : */
2223 : int
4 andres 2224 GNC 343400 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2225 : {
2226 : int returnCode;
2227 : ssize_t written;
2228 :
2229 343400 : Assert(FileIsValid(file));
2230 :
2231 : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2232 : file, VfdCache[file].fileName,
2233 : (int64) offset, (int64) amount));
2234 :
2235 343400 : returnCode = FileAccess(file);
2236 343400 : if (returnCode < 0)
4 andres 2237 UNC 0 : return returnCode;
2238 :
4 andres 2239 GNC 343400 : pgstat_report_wait_start(wait_event_info);
2240 343400 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2241 343400 : pgstat_report_wait_end();
2242 :
2243 343400 : if (written < 0)
4 andres 2244 UNC 0 : return -1;
4 andres 2245 GNC 343400 : else if (written != amount)
2246 : {
2247 : /* if errno is unset, assume problem is no disk space */
4 andres 2248 UNC 0 : if (errno == 0)
2249 0 : errno = ENOSPC;
2250 0 : return -1;
2251 : }
2252 :
4 andres 2253 GNC 343400 : return 0;
2254 : }
2255 :
2256 : /*
2257 : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2258 : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2259 : * use FileZero() instead.
2260 : *
2261 : * Note that at least glibc() implements posix_fallocate() in userspace if not
2262 : * implemented by the filesystem. That's not the case for all environments
2263 : * though.
2264 : *
2265 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2266 : * appropriate error.
2267 : */
2268 : int
2269 385 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2270 : {
2271 : #ifdef HAVE_POSIX_FALLOCATE
2272 : int returnCode;
2273 :
2274 385 : Assert(FileIsValid(file));
2275 :
2276 : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2277 : file, VfdCache[file].fileName,
2278 : (int64) offset, (int64) amount));
2279 :
2280 385 : returnCode = FileAccess(file);
2281 385 : if (returnCode < 0)
4 andres 2282 UNC 0 : return -1;
2283 :
4 andres 2284 GNC 385 : pgstat_report_wait_start(wait_event_info);
2285 385 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2286 385 : pgstat_report_wait_end();
2287 :
2288 385 : if (returnCode == 0)
2289 385 : return 0;
2290 :
2291 : /* for compatibility with %m printing etc */
4 andres 2292 UNC 0 : errno = returnCode;
2293 :
2294 : /*
2295 : * Return in cases of a "real" failure, if fallocate is not supported,
2296 : * fall through to the FileZero() backed implementation.
2297 : */
2298 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2299 0 : return -1;
2300 : #endif
2301 :
2302 0 : return FileZero(file, offset, amount, wait_event_info);
2303 : }
2304 :
5508 tgl 2305 ECB : off_t
1614 tmunro 2306 GIC 4252400 : FileSize(File file)
9770 scrappy 2307 ECB : {
8736 tgl 2308 CBC 4252400 : Assert(FileIsValid(file));
2309 :
2310 : DO_DB(elog(LOG, "FileSize %d (%s)",
1614 tmunro 2311 EUB : file, VfdCache[file].fileName));
2312 :
9345 bruce 2313 GIC 4252400 : if (FileIsNotOpen(file))
2314 : {
1614 tmunro 2315 79988 : if (FileAccess(file) < 0)
1614 tmunro 2316 UIC 0 : return (off_t) -1;
8313 tgl 2317 EUB : }
2238 2318 :
1614 tmunro 2319 GIC 4252400 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2320 : }
9770 scrappy 2321 EUB :
2322 : int
2213 rhaas 2323 GIC 396 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
2324 : {
9344 bruce 2325 ECB : int returnCode;
2326 :
8736 tgl 2327 CBC 396 : Assert(FileIsValid(file));
2328 :
2329 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2330 : file, VfdCache[file].fileName));
2331 :
6887 2332 396 : returnCode = FileAccess(file);
6887 tgl 2333 GIC 396 : if (returnCode < 0)
6887 tgl 2334 LBC 0 : return returnCode;
6887 tgl 2335 EUB :
2213 rhaas 2336 GIC 396 : pgstat_report_wait_start(wait_event_info);
5508 tgl 2337 396 : returnCode = ftruncate(VfdCache[file].fd, offset);
2213 rhaas 2338 CBC 396 : pgstat_report_wait_end();
2339 :
4284 tgl 2340 GIC 396 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2341 : {
4284 tgl 2342 ECB : /* adjust our state for truncation of a temp file */
1955 andres 2343 UIC 0 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
4284 tgl 2344 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2345 0 : VfdCache[file].fileSize = offset;
4284 tgl 2346 ECB : }
2347 :
8986 bruce 2348 GIC 396 : return returnCode;
2349 : }
2350 :
4995 heikki.linnakangas 2351 ECB : /*
2352 : * Return the pathname associated with an open file.
4995 heikki.linnakangas 2353 EUB : *
2354 : * The returned string points to an internal buffer, which is valid until
4995 heikki.linnakangas 2355 ECB : * the file is closed.
2356 : */
2357 : char *
4995 heikki.linnakangas 2358 UIC 0 : FilePathName(File file)
4995 heikki.linnakangas 2359 ECB : {
4995 heikki.linnakangas 2360 UIC 0 : Assert(FileIsValid(file));
2361 :
4995 heikki.linnakangas 2362 UBC 0 : return VfdCache[file].fileName;
4995 heikki.linnakangas 2363 EUB : }
2364 :
2365 : /*
2366 : * Return the raw file descriptor of an opened file.
2588 rhaas 2367 ECB : *
2368 : * The returned file descriptor will be valid until the file is closed, but
2369 : * there are a lot of things that can make that happen. So the caller should
2370 : * be careful not to do much of anything else before it finishes using the
2371 : * returned file descriptor.
2372 : */
2373 : int
2588 rhaas 2374 UIC 0 : FileGetRawDesc(File file)
2375 : {
2376 0 : Assert(FileIsValid(file));
2588 rhaas 2377 UBC 0 : return VfdCache[file].fd;
2378 : }
2588 rhaas 2379 EUB :
2380 : /*
2381 : * FileGetRawFlags - returns the file flags on open(2)
2382 : */
2383 : int
2588 rhaas 2384 UIC 0 : FileGetRawFlags(File file)
2385 : {
2386 0 : Assert(FileIsValid(file));
2387 0 : return VfdCache[file].fileFlags;
2388 : }
2389 :
2390 : /*
2391 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2392 : */
2024 peter_e 2393 EUB : mode_t
2588 rhaas 2394 UIC 0 : FileGetRawMode(File file)
2588 rhaas 2395 EUB : {
2588 rhaas 2396 UBC 0 : Assert(FileIsValid(file));
2588 rhaas 2397 UIC 0 : return VfdCache[file].fileMode;
2398 : }
2399 :
2400 : /*
2401 : * Make room for another allocatedDescs[] array entry if needed and possible.
2402 : * Returns true if an array element is available.
3591 tgl 2403 EUB : */
2404 : static bool
3591 tgl 2405 GBC 704666 : reserveAllocatedDesc(void)
3591 tgl 2406 EUB : {
2407 : AllocateDesc *newDescs;
2408 : int newMax;
2409 :
2410 : /* Quick out if array already has a free slot. */
3591 tgl 2411 GIC 704666 : if (numAllocatedDescs < maxAllocatedDescs)
2412 702833 : return true;
3591 tgl 2413 EUB :
2414 : /*
2415 : * If the array hasn't yet been created in the current process, initialize
1140 2416 : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2417 : * we will ever need, anyway. We don't want to look at max_safe_fds
2418 : * immediately because set_max_safe_fds() may not have run yet.
2419 : */
3591 tgl 2420 GIC 1833 : if (allocatedDescs == NULL)
2421 : {
1140 2422 1833 : newMax = FD_MINFREE / 3;
3591 2423 1833 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
3591 tgl 2424 ECB : /* Out of memory already? Treat as fatal error. */
3591 tgl 2425 GIC 1833 : if (newDescs == NULL)
3591 tgl 2426 UIC 0 : ereport(ERROR,
2427 : (errcode(ERRCODE_OUT_OF_MEMORY),
2428 : errmsg("out of memory")));
3591 tgl 2429 GIC 1833 : allocatedDescs = newDescs;
3591 tgl 2430 CBC 1833 : maxAllocatedDescs = newMax;
2431 1833 : return true;
2432 : }
2433 :
2434 : /*
2435 : * Consider enlarging the array beyond the initial allocation used above.
2436 : * By the time this happens, max_safe_fds should be known accurately.
2437 : *
2438 : * We mustn't let allocated descriptors hog all the available FDs, and in
3591 tgl 2439 ECB : * practice we'd better leave a reasonable number of FDs for VFD use. So
2440 : * set the maximum to max_safe_fds / 3. (This should certainly be at
1140 2441 : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2442 : * tightening the restriction here.) Recall that "external" FDs are
2443 : * allowed to consume another third of max_safe_fds.
3591 2444 : */
1140 tgl 2445 UBC 0 : newMax = max_safe_fds / 3;
3591 tgl 2446 UIC 0 : if (newMax > maxAllocatedDescs)
2447 : {
3591 tgl 2448 LBC 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
3591 tgl 2449 ECB : newMax * sizeof(AllocateDesc));
2450 : /* Treat out-of-memory as a non-fatal error. */
3591 tgl 2451 UIC 0 : if (newDescs == NULL)
2452 0 : return false;
2453 0 : allocatedDescs = newDescs;
2454 0 : maxAllocatedDescs = newMax;
2455 0 : return true;
2456 : }
2457 :
2458 : /* Can't enlarge allocatedDescs[] any more. */
2459 0 : return false;
2460 : }
2461 :
2462 : /*
2463 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
8736 tgl 2464 EUB : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
3260 bruce 2465 : * necessary to open the file. When done, call FreeFile rather than fclose.
2466 : *
8736 tgl 2467 : * Note that files that will be open for any significant length of time
2468 : * should NOT be handled this way, since they cannot share kernel file
2469 : * descriptors with other files; there is grave risk of running out of FDs
2470 : * if anyone locks down too many FDs. Most callers of this routine are
2471 : * simply reading a config file that they will read and close immediately.
9770 scrappy 2472 : *
8736 tgl 2473 : * fd.c will automatically close all files opened with AllocateFile at
2474 : * transaction commit or abort; this prevents FD leakage if a routine
2475 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2476 : *
2477 : * Ideally this should be the *only* direct call of fopen() in the backend.
9770 scrappy 2478 : */
2479 : FILE *
6245 tgl 2480 GIC 63940 : AllocateFile(const char *name, const char *mode)
2481 : {
2482 : FILE *file;
2483 :
2484 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2485 : numAllocatedDescs, name));
2486 :
2487 : /* Can we allocate another non-virtual FD? */
3591 2488 63940 : if (!reserveAllocatedDesc())
3591 tgl 2489 UIC 0 : ereport(ERROR,
2490 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2491 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2492 : maxAllocatedDescs, name)));
2493 :
2494 : /* Close excess kernel FDs. */
3591 tgl 2495 GIC 63940 : ReleaseLruFiles();
2496 :
9365 bruce 2497 63940 : TryAgain:
8260 tgl 2498 63940 : if ((file = fopen(name, mode)) != NULL)
9345 bruce 2499 ECB : {
6829 tgl 2500 GIC 57546 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2501 :
2502 57546 : desc->kind = AllocateDescFile;
2503 57546 : desc->desc.file = file;
6779 2504 57546 : desc->create_subid = GetCurrentSubTransactionId();
6829 2505 57546 : numAllocatedDescs++;
2506 57546 : return desc->desc.file;
8260 tgl 2507 ECB : }
8260 tgl 2508 EUB :
8260 tgl 2509 GIC 6394 : if (errno == EMFILE || errno == ENFILE)
2510 : {
8053 bruce 2511 UIC 0 : int save_errno = errno;
2512 :
7199 tgl 2513 0 : ereport(LOG,
7199 tgl 2514 ECB : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2515 : errmsg("out of file descriptors: %m; release and retry")));
8260 tgl 2516 LBC 0 : errno = 0;
2517 0 : if (ReleaseLruFile())
9345 bruce 2518 UIC 0 : goto TryAgain;
8260 tgl 2519 LBC 0 : errno = save_errno;
2520 : }
8260 tgl 2521 ECB :
8260 tgl 2522 CBC 6394 : return NULL;
9770 scrappy 2523 ECB : }
2524 :
2024 peter_e 2525 : /*
2526 : * Open a file with OpenTransientFilePerm() and pass default file mode for
2527 : * the fileMode parameter.
2528 : */
2529 : int
2024 peter_e 2530 GBC 599546 : OpenTransientFile(const char *fileName, int fileFlags)
2531 : {
1828 sfrost 2532 599546 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2533 : }
2534 :
6829 tgl 2535 EUB : /*
3785 heikki.linnakangas 2536 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2537 : */
2538 : int
2024 peter_e 2539 GIC 599552 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2540 : {
3785 heikki.linnakangas 2541 ECB : int fd;
2542 :
2543 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2544 : numAllocatedDescs, fileName));
2545 :
2546 : /* Can we allocate another non-virtual FD? */
3591 tgl 2547 GIC 599552 : if (!reserveAllocatedDesc())
3591 tgl 2548 UIC 0 : ereport(ERROR,
3591 tgl 2549 ECB : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2550 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2551 : maxAllocatedDescs, fileName)));
2552 :
2553 : /* Close excess kernel FDs. */
3591 tgl 2554 GIC 599552 : ReleaseLruFiles();
2555 :
2024 peter_e 2556 599552 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2557 :
3785 heikki.linnakangas 2558 CBC 599552 : if (fd >= 0)
2559 : {
3785 heikki.linnakangas 2560 GIC 596644 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2561 :
2562 596644 : desc->kind = AllocateDescRawFD;
2563 596644 : desc->desc.fd = fd;
2564 596644 : desc->create_subid = GetCurrentSubTransactionId();
2565 596644 : numAllocatedDescs++;
3785 heikki.linnakangas 2566 ECB :
3785 heikki.linnakangas 2567 GBC 596644 : return fd;
2568 : }
2569 :
3785 heikki.linnakangas 2570 GIC 2908 : return -1; /* failure */
2571 : }
2572 :
3693 heikki.linnakangas 2573 ECB : /*
2574 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2575 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2576 : * necessary. When done, call ClosePipeStream rather than pclose.
1602 tgl 2577 : *
2578 : * This function also ensures that the popen'd program is run with default
2579 : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2580 : * uses. This ensures desirable response to, eg, closing a read pipe early.
3693 heikki.linnakangas 2581 : */
2582 : FILE *
3693 heikki.linnakangas 2583 CBC 311 : OpenPipeStream(const char *command, const char *mode)
3693 heikki.linnakangas 2584 ECB : {
2585 : FILE *file;
1602 tgl 2586 : int save_errno;
2587 :
2588 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
3693 heikki.linnakangas 2589 : numAllocatedDescs, command));
2590 :
2591 : /* Can we allocate another non-virtual FD? */
3591 tgl 2592 GIC 311 : if (!reserveAllocatedDesc())
3591 tgl 2593 UIC 0 : ereport(ERROR,
2594 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2595 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2596 : maxAllocatedDescs, command)));
2597 :
2598 : /* Close excess kernel FDs. */
3591 tgl 2599 GIC 311 : ReleaseLruFiles();
2600 :
3693 heikki.linnakangas 2601 311 : TryAgain:
223 tgl 2602 GNC 311 : fflush(NULL);
1602 tgl 2603 GIC 311 : pqsignal(SIGPIPE, SIG_DFL);
3693 heikki.linnakangas 2604 311 : errno = 0;
1602 tgl 2605 311 : file = popen(command, mode);
2606 311 : save_errno = errno;
2607 311 : pqsignal(SIGPIPE, SIG_IGN);
2608 311 : errno = save_errno;
2609 311 : if (file != NULL)
3693 heikki.linnakangas 2610 ECB : {
3693 heikki.linnakangas 2611 GBC 311 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2612 :
3693 heikki.linnakangas 2613 GIC 311 : desc->kind = AllocateDescPipe;
2614 311 : desc->desc.file = file;
2615 311 : desc->create_subid = GetCurrentSubTransactionId();
2616 311 : numAllocatedDescs++;
3693 heikki.linnakangas 2617 CBC 311 : return desc->desc.file;
2618 : }
3693 heikki.linnakangas 2619 ECB :
3693 heikki.linnakangas 2620 LBC 0 : if (errno == EMFILE || errno == ENFILE)
3693 heikki.linnakangas 2621 ECB : {
3693 heikki.linnakangas 2622 LBC 0 : ereport(LOG,
3693 heikki.linnakangas 2623 ECB : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2624 : errmsg("out of file descriptors: %m; release and retry")));
3693 heikki.linnakangas 2625 LBC 0 : if (ReleaseLruFile())
2626 0 : goto TryAgain;
2627 0 : errno = save_errno;
2628 : }
3693 heikki.linnakangas 2629 ECB :
3693 heikki.linnakangas 2630 UIC 0 : return NULL;
3693 heikki.linnakangas 2631 ECB : }
2632 :
3785 2633 : /*
2634 : * Free an AllocateDesc of any type.
6829 tgl 2635 : *
2636 : * The argument *must* point into the allocatedDescs[] array.
2637 : */
6829 tgl 2638 EUB : static int
6829 tgl 2639 GIC 694748 : FreeDesc(AllocateDesc *desc)
6829 tgl 2640 EUB : {
2641 : int result;
2642 :
2643 : /* Close the underlying object */
6829 tgl 2644 GBC 694748 : switch (desc->kind)
6829 tgl 2645 EUB : {
6829 tgl 2646 GIC 57546 : case AllocateDescFile:
2647 57546 : result = fclose(desc->desc.file);
6829 tgl 2648 GBC 57546 : break;
3693 heikki.linnakangas 2649 GIC 311 : case AllocateDescPipe:
2650 311 : result = pclose(desc->desc.file);
2651 311 : break;
6829 tgl 2652 40247 : case AllocateDescDir:
2653 40247 : result = closedir(desc->desc.dir);
2654 40247 : break;
3785 heikki.linnakangas 2655 596644 : case AllocateDescRawFD:
2656 596644 : result = close(desc->desc.fd);
3785 heikki.linnakangas 2657 CBC 596644 : break;
6829 tgl 2658 UIC 0 : default:
2659 0 : elog(ERROR, "AllocateDesc kind not recognized");
2660 : result = 0; /* keep compiler quiet */
2661 : break;
6829 tgl 2662 ECB : }
2663 :
2664 : /* Compact storage in the allocatedDescs array */
6829 tgl 2665 CBC 694748 : numAllocatedDescs--;
2666 694748 : *desc = allocatedDescs[numAllocatedDescs];
6829 tgl 2667 ECB :
6829 tgl 2668 CBC 694748 : return result;
6829 tgl 2669 ECB : }
2670 :
7013 2671 : /*
2672 : * Close a file returned by AllocateFile.
2673 : *
2674 : * Note we do not check fclose's return value --- it is up to the caller
2675 : * to handle close errors.
7013 tgl 2676 EUB : */
2677 : int
9344 bruce 2678 GIC 57537 : FreeFile(FILE *file)
2679 : {
2680 : int i;
2681 :
2682 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
8736 tgl 2683 ECB :
2684 : /* Remove file from list of allocated files, if it's present */
6829 tgl 2685 GIC 57538 : for (i = numAllocatedDescs; --i >= 0;)
8736 tgl 2686 ECB : {
6829 tgl 2687 GIC 57538 : AllocateDesc *desc = &allocatedDescs[i];
2688 :
2689 57538 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2690 57537 : return FreeDesc(desc);
2691 : }
2692 :
2693 : /* Only get here if someone passes us a file not in allocatedDescs */
6829 tgl 2694 UIC 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2695 :
7013 tgl 2696 LBC 0 : return fclose(file);
2697 : }
2698 :
2699 : /*
2700 : * Close a file returned by OpenTransientFile.
2701 : *
2702 : * Note we do not check close's return value --- it is up to the caller
3785 heikki.linnakangas 2703 ECB : * to handle close errors.
2704 : */
2705 : int
3785 heikki.linnakangas 2706 GIC 596643 : CloseTransientFile(int fd)
3785 heikki.linnakangas 2707 ECB : {
2708 : int i;
2709 :
2710 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2711 :
3785 heikki.linnakangas 2712 EUB : /* Remove fd from list of allocated files, if it's present */
3785 heikki.linnakangas 2713 GIC 596643 : for (i = numAllocatedDescs; --i >= 0;)
3785 heikki.linnakangas 2714 EUB : {
3785 heikki.linnakangas 2715 GIC 596643 : AllocateDesc *desc = &allocatedDescs[i];
2716 :
2717 596643 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2718 596643 : return FreeDesc(desc);
2719 : }
2720 :
2721 : /* Only get here if someone passes us a file not in allocatedDescs */
3785 heikki.linnakangas 2722 UIC 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2723 :
3785 heikki.linnakangas 2724 LBC 0 : return close(fd);
2725 : }
2726 :
2727 : /*
2728 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2729 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2730 : * necessary to open the directory, and with closing it after an elog.
6985 tgl 2731 ECB : * When done, call FreeDir rather than closedir.
2732 : *
1952 2733 : * Returns NULL, with errno set, on failure. Note that failure detection
2734 : * is commonly left to the following call of ReadDir or ReadDirExtended;
2735 : * see the comments for ReadDir.
2736 : *
2737 : * Ideally this should be the *only* direct call of opendir() in the backend.
2738 : */
2739 : DIR *
6985 tgl 2740 GBC 40863 : AllocateDir(const char *dirname)
2741 : {
6797 bruce 2742 EUB : DIR *dir;
2743 :
2744 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2745 : numAllocatedDescs, dirname));
2746 :
2747 : /* Can we allocate another non-virtual FD? */
3591 tgl 2748 GIC 40863 : if (!reserveAllocatedDesc())
3591 tgl 2749 UIC 0 : ereport(ERROR,
2750 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2751 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2752 : maxAllocatedDescs, dirname)));
2753 :
2754 : /* Close excess kernel FDs. */
3591 tgl 2755 GIC 40863 : ReleaseLruFiles();
2756 :
6985 2757 40863 : TryAgain:
6985 tgl 2758 CBC 40863 : if ((dir = opendir(dirname)) != NULL)
2759 : {
6829 tgl 2760 GIC 40247 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2761 :
2762 40247 : desc->kind = AllocateDescDir;
2763 40247 : desc->desc.dir = dir;
6779 2764 40247 : desc->create_subid = GetCurrentSubTransactionId();
6829 2765 40247 : numAllocatedDescs++;
6829 tgl 2766 CBC 40247 : return desc->desc.dir;
6985 tgl 2767 EUB : }
2768 :
6985 tgl 2769 GIC 616 : if (errno == EMFILE || errno == ENFILE)
2770 : {
6985 tgl 2771 UIC 0 : int save_errno = errno;
2772 :
6985 tgl 2773 LBC 0 : ereport(LOG,
2774 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
6385 bruce 2775 ECB : errmsg("out of file descriptors: %m; release and retry")));
6985 tgl 2776 LBC 0 : errno = 0;
6985 tgl 2777 UIC 0 : if (ReleaseLruFile())
6985 tgl 2778 LBC 0 : goto TryAgain;
6985 tgl 2779 UIC 0 : errno = save_errno;
6985 tgl 2780 ECB : }
2781 :
6985 tgl 2782 CBC 616 : return NULL;
6985 tgl 2783 ECB : }
2784 :
2785 : /*
2786 : * Read a directory opened with AllocateDir, ereport'ing any error.
6503 2787 : *
2788 : * This is easier to use than raw readdir() since it takes care of some
3260 bruce 2789 EUB : * otherwise rather tedious and error-prone manipulation of errno. Also,
2790 : * if you are happy with a generic error message for AllocateDir failure,
6503 tgl 2791 : * you can just do
2792 : *
2793 : * dir = AllocateDir(path);
2794 : * while ((dirent = ReadDir(dir, path)) != NULL)
2795 : * process dirent;
6331 2796 : * FreeDir(dir);
6503 2797 : *
2798 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2799 : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
1952 tgl 2800 ECB : * use this shortcut.)
2801 : *
2802 : * The pathname passed to AllocateDir must be passed to this routine too,
2803 : * but it is only used for error reporting.
2804 : */
2805 : struct dirent *
6503 tgl 2806 GIC 1011151 : ReadDir(DIR *dir, const char *dirname)
2807 : {
2873 2808 1011151 : return ReadDirExtended(dir, dirname, ERROR);
2809 : }
2810 :
2811 : /*
2812 : * Alternate version of ReadDir that allows caller to specify the elevel
2813 : * for any error report (whether it's reporting an initial failure of
2814 : * AllocateDir or a subsequent directory read failure).
2815 : *
2816 : * If elevel < ERROR, returns NULL after any error. With the normal coding
2817 : * pattern, this will result in falling out of the loop immediately as
2818 : * though the directory contained no (more) entries.
2819 : */
2820 : struct dirent *
2821 1898454 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2822 : {
2823 : struct dirent *dent;
6503 tgl 2824 ECB :
2825 : /* Give a generic message for AllocateDir failure, if caller didn't */
6503 tgl 2826 CBC 1898454 : if (dir == NULL)
2827 : {
2873 tgl 2828 GIC 3 : ereport(elevel,
2829 : (errcode_for_file_access(),
2830 : errmsg("could not open directory \"%s\": %m",
2831 : dirname)));
2873 tgl 2832 UIC 0 : return NULL;
2833 : }
2834 :
6503 tgl 2835 GIC 1898451 : errno = 0;
2836 1898451 : if ((dent = readdir(dir)) != NULL)
2837 1865925 : return dent;
2838 :
6503 tgl 2839 CBC 32526 : if (errno)
2873 tgl 2840 UIC 0 : ereport(elevel,
2841 : (errcode_for_file_access(),
2842 : errmsg("could not read directory \"%s\": %m",
2843 : dirname)));
6503 tgl 2844 CBC 32526 : return NULL;
2845 : }
6503 tgl 2846 ECB :
2847 : /*
2848 : * Close a directory opened with AllocateDir.
2849 : *
1952 tgl 2850 EUB : * Returns closedir's return value (with errno set if it's not 0).
2851 : * Note we do not check the return value --- it is up to the caller
2852 : * to handle close errors if wanted.
1952 tgl 2853 ECB : *
2854 : * Does nothing if dir == NULL; we assume that directory open failure was
2855 : * already reported if desired.
2856 : */
6985 2857 : int
6985 tgl 2858 GBC 40158 : FreeDir(DIR *dir)
2859 : {
2860 : int i;
2861 :
1952 tgl 2862 ECB : /* Nothing to do if AllocateDir failed */
1952 tgl 2863 GIC 40158 : if (dir == NULL)
1952 tgl 2864 UIC 0 : return 0;
2865 :
2866 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2867 :
2868 : /* Remove dir from list of allocated dirs, if it's present */
6829 tgl 2869 GIC 40158 : for (i = numAllocatedDescs; --i >= 0;)
2870 : {
2871 40158 : AllocateDesc *desc = &allocatedDescs[i];
2872 :
2873 40158 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2874 40158 : return FreeDesc(desc);
2875 : }
6829 tgl 2876 ECB :
2877 : /* Only get here if someone passes us a dir not in allocatedDescs */
6829 tgl 2878 UIC 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2879 :
6985 2880 0 : return closedir(dir);
6985 tgl 2881 ECB : }
6985 tgl 2882 EUB :
2883 :
2884 : /*
2885 : * Close a pipe stream returned by OpenPipeStream.
2886 : */
3693 heikki.linnakangas 2887 ECB : int
3693 heikki.linnakangas 2888 GIC 311 : ClosePipeStream(FILE *file)
3693 heikki.linnakangas 2889 ECB : {
2890 : int i;
2891 :
2892 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2893 :
2894 : /* Remove file from list of allocated files, if it's present */
3693 heikki.linnakangas 2895 GIC 311 : for (i = numAllocatedDescs; --i >= 0;)
3693 heikki.linnakangas 2896 EUB : {
3693 heikki.linnakangas 2897 GIC 311 : AllocateDesc *desc = &allocatedDescs[i];
3693 heikki.linnakangas 2898 EUB :
3693 heikki.linnakangas 2899 GIC 311 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2900 311 : return FreeDesc(desc);
2901 : }
2902 :
2903 : /* Only get here if someone passes us a file not in allocatedDescs */
3693 heikki.linnakangas 2904 UIC 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2905 :
3693 heikki.linnakangas 2906 LBC 0 : return pclose(file);
2907 : }
2908 :
2909 : /*
2910 : * closeAllVfds
2911 : *
2912 : * Force all VFDs into the physically-closed state, so that the fewest
8736 tgl 2913 ECB : * possible number of kernel file descriptors are in use. There is no
2914 : * change in the logical state of the VFDs.
2915 : */
2916 : void
8260 tgl 2917 CBC 26 : closeAllVfds(void)
9770 scrappy 2918 ECB : {
2919 : Index i;
2920 :
8736 tgl 2921 GIC 26 : if (SizeVfdCache > 0)
8736 tgl 2922 EUB : {
2118 tgl 2923 GIC 26 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
8736 tgl 2924 GBC 832 : for (i = 1; i < SizeVfdCache; i++)
2925 : {
8736 tgl 2926 GIC 806 : if (!FileIsNotOpen(i))
2927 64 : LruDelete(i);
2928 : }
2929 : }
2930 26 : }
2931 :
2932 :
2933 : /*
2934 : * SetTempTablespaces
5785 tgl 2935 ECB : *
2936 : * Define a list (actually an array) of OIDs of tablespaces to use for
2937 : * temporary files. This list will be used until end of transaction,
2938 : * unless this function is called again before then. It is caller's
2939 : * responsibility that the passed-in array has adequate lifespan (typically
2940 : * it'd be allocated in TopTransactionContext).
1010 2941 : *
2942 : * Some entries of the array may be InvalidOid, indicating that the current
2943 : * database's default tablespace should be used.
5785 2944 : */
2945 : void
5785 tgl 2946 GIC 3645 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2947 : {
5785 tgl 2948 CBC 3645 : Assert(numSpaces >= 0);
5785 tgl 2949 GIC 3645 : tempTableSpaces = tableSpaces;
2950 3645 : numTempTableSpaces = numSpaces;
2951 :
2952 : /*
2953 : * Select a random starting point in the list. This is to minimize
2954 : * conflicts between backends that are most likely sharing the same list
2955 : * of temp tablespaces. Note that if we create multiple temp files in the
2956 : * same transaction, we'll advance circularly through the list --- this
2957 : * ensures that large temporary sort files are nicely spread across all
2958 : * available tablespaces.
2959 : */
2960 3645 : if (numSpaces > 1)
497 tgl 2961 UIC 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
2962 0 : 0, numSpaces - 1);
2963 : else
5785 tgl 2964 CBC 3645 : nextTempTableSpace = 0;
5785 tgl 2965 GIC 3645 : }
5785 tgl 2966 ECB :
2967 : /*
2968 : * TempTablespacesAreSet
2969 : *
2970 : * Returns true if SetTempTablespaces has been called in current transaction.
2971 : * (This is just so that tablespaces.c doesn't need its own per-transaction
2972 : * state.)
2973 : */
2974 : bool
5785 tgl 2975 GIC 4525 : TempTablespacesAreSet(void)
2976 : {
2977 4525 : return (numTempTableSpaces >= 0);
5785 tgl 2978 ECB : }
5785 tgl 2979 EUB :
1955 andres 2980 : /*
2981 : * GetTempTablespaces
1955 andres 2982 ECB : *
2983 : * Populate an array with the OIDs of the tablespaces that should be used for
2984 : * temporary files. (Some entries may be InvalidOid, indicating that the
2985 : * current database's default tablespace should be used.) At most numSpaces
2986 : * entries will be filled.
2987 : * Returns the number of OIDs that were copied into the output array.
2988 : */
2989 : int
1955 andres 2990 GIC 174 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2991 : {
2992 : int i;
1955 andres 2993 ECB :
1955 andres 2994 GIC 174 : Assert(TempTablespacesAreSet());
1955 andres 2995 CBC 174 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
1955 andres 2996 UIC 0 : tableSpaces[i] = tempTableSpaces[i];
2997 :
1955 andres 2998 GIC 174 : return i;
2999 : }
3000 :
3001 : /*
3002 : * GetNextTempTableSpace
3003 : *
3004 : * Select the next temp tablespace to use. A result of InvalidOid means
3005 : * to use the current database's default tablespace.
3006 : */
3007 : Oid
5785 tgl 3008 CBC 1896 : GetNextTempTableSpace(void)
3009 : {
5785 tgl 3010 GIC 1896 : if (numTempTableSpaces > 0)
3011 : {
5785 tgl 3012 ECB : /* Advance nextTempTableSpace counter with wraparound */
5785 tgl 3013 CBC 1 : if (++nextTempTableSpace >= numTempTableSpaces)
5785 tgl 3014 GBC 1 : nextTempTableSpace = 0;
5785 tgl 3015 GIC 1 : return tempTableSpaces[nextTempTableSpace];
5785 tgl 3016 ECB : }
5785 tgl 3017 GIC 1895 : return InvalidOid;
3018 : }
3019 :
3020 :
3021 : /*
3022 : * AtEOSubXact_Files
3023 : *
3024 : * Take care of subtransaction commit/abort. At abort, we close temp files
3025 : * that the subtransaction may have opened. At commit, we reassign the
6779 tgl 3026 ECB : * files that were opened to the parent subtransaction.
3027 : */
6829 3028 : void
6779 tgl 3029 GIC 8795 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3030 : SubTransactionId parentSubid)
6829 tgl 3031 ECB : {
6797 bruce 3032 : Index i;
6829 tgl 3033 :
6829 tgl 3034 GIC 8795 : for (i = 0; i < numAllocatedDescs; i++)
6829 tgl 3035 ECB : {
6779 tgl 3036 UIC 0 : if (allocatedDescs[i].create_subid == mySubid)
3037 : {
6829 3038 0 : if (isCommit)
6779 3039 0 : allocatedDescs[i].create_subid = parentSubid;
3040 : else
3041 : {
3042 : /* have to recheck the item after FreeDesc (ugly) */
6829 3043 0 : FreeDesc(&allocatedDescs[i--]);
3044 : }
3045 : }
3046 : }
6829 tgl 3047 CBC 8795 : }
3048 :
3049 : /*
3050 : * AtEOXact_Files
3051 : *
1807 tgl 3052 ECB : * This routine is called during transaction commit or abort. All still-open
3053 : * per-transaction temporary file VFDs are closed, which also causes the
1807 tgl 3054 EUB : * underlying files to be deleted (although they should've been closed already
3055 : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3056 : * closed. We also forget any transaction-local temp tablespace list.
3057 : *
3058 : * The isCommit flag is used only to decide whether to emit warnings about
3059 : * unclosed files.
3060 : */
8736 3061 : void
1807 tgl 3062 GIC 486167 : AtEOXact_Files(bool isCommit)
3063 : {
3064 486167 : CleanupTempFiles(isCommit, false);
5785 tgl 3065 CBC 486167 : tempTableSpaces = NULL;
5785 tgl 3066 GIC 486167 : numTempTableSpaces = -1;
7285 3067 486167 : }
3068 :
3069 : /*
3070 : * BeforeShmemExit_Files
3071 : *
3072 : * before_shmem_access hook to clean up temp files during backend shutdown.
3073 : * Here, we want to clean up *all* temp files including interXact ones.
3074 : */
3075 : static void
610 andres 3076 13291 : BeforeShmemExit_Files(int code, Datum arg)
3077 : {
1807 tgl 3078 13291 : CleanupTempFiles(false, true);
3079 :
610 andres 3080 ECB : /* prevent further temp files from being created */
3081 : #ifdef USE_ASSERT_CHECKING
610 andres 3082 CBC 13291 : temporary_files_allowed = false;
610 andres 3083 ECB : #endif
7285 tgl 3084 CBC 13291 : }
7285 tgl 3085 ECB :
3086 : /*
3087 : * Close temporary files and delete their underlying files.
3088 : *
3089 : * isCommit: if true, this is normal transaction commit, and we don't
3090 : * expect any remaining files; warn if there are some.
3091 : *
3092 : * isProcExit: if true, this is being called as the backend process is
3093 : * exiting. If that's the case, we should remove all temporary files; if
3094 : * that's not the case, we are being called for transaction commit/abort
3095 : * and should only remove transaction-local temp files. In either case,
3785 heikki.linnakangas 3096 : * also clean up "allocated" stdio files, dirs and fds.
3097 : */
3098 : static void
1807 tgl 3099 GIC 499458 : CleanupTempFiles(bool isCommit, bool isProcExit)
7285 tgl 3100 ECB : {
3101 : Index i;
8736 3102 :
3103 : /*
3104 : * Careful here: at proc_exit we need extra cleanup, not just
3105 : * xact_temporary files.
3106 : */
3826 tgl 3107 GIC 499458 : if (isProcExit || have_xact_temporary_files)
3108 : {
2118 3109 14042 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
8736 3110 843516 : for (i = 1; i < SizeVfdCache; i++)
3111 : {
7285 3112 829474 : unsigned short fdstate = VfdCache[i].fdstate;
3113 :
1955 andres 3114 829474 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3115 5 : VfdCache[i].fileName != NULL)
3116 : {
3826 tgl 3117 ECB : /*
3118 : * If we're in the process of exiting a backend process, close
3119 : * all temporary files. Otherwise, only close temporary files
3120 : * local to the current transaction. They should be closed by
3121 : * the ResourceOwner mechanism already, so this is just a
3122 : * debugging cross-check.
3123 : */
3826 tgl 3124 GIC 5 : if (isProcExit)
3826 tgl 3125 CBC 5 : FileClose(i);
1955 andres 3126 UIC 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
4875 heikki.linnakangas 3127 ECB : {
3826 tgl 3128 LBC 0 : elog(WARNING,
3129 : "temporary file %s not closed at end-of-transaction",
3826 tgl 3130 ECB : VfdCache[i].fileName);
3826 tgl 3131 UIC 0 : FileClose(i);
4875 heikki.linnakangas 3132 ECB : }
7285 tgl 3133 : }
3134 : }
3135 :
3826 tgl 3136 GIC 14042 : have_xact_temporary_files = false;
3137 : }
3138 :
3139 : /* Complain if any allocated files remain open at commit. */
1807 3140 499458 : if (isCommit && numAllocatedDescs > 0)
1807 tgl 3141 UIC 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
1807 tgl 3142 ECB : numAllocatedDescs);
3143 :
3785 heikki.linnakangas 3144 EUB : /* Clean up "allocated" stdio files, dirs and fds. */
6829 tgl 3145 GIC 499557 : while (numAllocatedDescs > 0)
6829 tgl 3146 GBC 99 : FreeDesc(&allocatedDescs[0]);
8736 tgl 3147 GIC 499458 : }
3148 :
7972 tgl 3149 EUB :
3150 : /*
3151 : * Remove temporary and temporary relation files left over from a prior
3152 : * postmaster session
3153 : *
7972 tgl 3154 ECB : * This should be called during postmaster startup. It will forcibly
3155 : * remove any leftover files created by OpenTemporaryFile and any leftover
3156 : * temporary relation files created by mdcreate.
3157 : *
752 tomas.vondra 3158 : * During post-backend-crash restart cycle, this routine is called when
752 tomas.vondra 3159 EUB : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3160 : * queries are using temp files could result in useless storage usage that can
3161 : * only be reclaimed by a service restart. The argument against enabling it is
3162 : * that someone might want to examine the temporary files for debugging
752 tomas.vondra 3163 ECB : * purposes. This does however mean that OpenTemporaryFile had better allow for
3164 : * collision with an existing temp file name.
1952 tgl 3165 : *
3166 : * NOTE: this function and its subroutines generally report syscall failures
3167 : * with ereport(LOG) and keep going. Removing temp files is not so critical
3168 : * that we should fail to start the database when we can't do it.
3169 : */
3170 : void
7972 tgl 3171 GIC 596 : RemovePgTempFiles(void)
3172 : {
3173 : char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3174 : DIR *spc_dir;
3175 : struct dirent *spc_de;
3176 :
3177 : /*
3178 : * First process temp files in pg_default ($PGDATA/base)
3179 : */
5789 3180 596 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
1918 3181 596 : RemovePgTempFilesInDir(temp_path, true, false);
4622 rhaas 3182 596 : RemovePgTempRelationFiles("base");
3183 :
3184 : /*
3185 : * Cycle through temp directories for all non-default tablespaces.
3186 : */
5789 tgl 3187 596 : spc_dir = AllocateDir("pg_tblspc");
3188 :
1952 tgl 3189 CBC 1839 : while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3190 : {
5789 tgl 3191 GIC 1243 : if (strcmp(spc_de->d_name, ".") == 0 ||
3192 647 : strcmp(spc_de->d_name, "..") == 0)
6675 3193 1192 : continue;
3194 :
4835 bruce 3195 51 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2118 tgl 3196 51 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
1918 3197 51 : RemovePgTempFilesInDir(temp_path, true, false);
4622 rhaas 3198 ECB :
4622 rhaas 3199 CBC 51 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
4382 bruce 3200 51 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
4622 rhaas 3201 GIC 51 : RemovePgTempRelationFiles(temp_path);
3202 : }
3203 :
5789 tgl 3204 596 : FreeDir(spc_dir);
6675 tgl 3205 ECB :
3206 : /*
6385 bruce 3207 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3208 : * DataDir as well. However, that is *not* cleaned here because doing so
1306 tgl 3209 : * would create a race condition. It's done separately, earlier in
3210 : * postmaster startup.
6675 3211 : */
6675 tgl 3212 GIC 596 : }
6675 tgl 3213 ECB :
1955 andres 3214 : /*
1918 tgl 3215 : * Process one pgsql_tmp directory for RemovePgTempFiles.
3216 : *
3217 : * If missing_ok is true, it's all right for the named directory to not exist.
3218 : * Any other problem results in a LOG message. (missing_ok should be true at
3219 : * the top level, since pgsql_tmp directories are not created until needed.)
3220 : *
3221 : * At the top level, this should be called with unlink_all = false, so that
1955 andres 3222 : * only files matching the temporary name prefix will be unlinked. When
3223 : * recursing it will be called with unlink_all = true to unlink everything
3224 : * under a top-level temporary directory.
3225 : *
3226 : * (These two flags could be replaced by one, but it seems clearer to keep
3227 : * them separate.)
3228 : */
3229 : void
1918 tgl 3230 CBC 648 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3231 : {
3232 : DIR *temp_dir;
3233 : struct dirent *temp_de;
3234 : char rm_path[MAXPGPATH * 2];
3235 :
6675 tgl 3236 GIC 648 : temp_dir = AllocateDir(tmpdirname);
3237 :
1918 3238 648 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
3239 603 : return;
3240 :
1952 3241 138 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3242 : {
6675 3243 93 : if (strcmp(temp_de->d_name, ".") == 0 ||
3244 48 : strcmp(temp_de->d_name, "..") == 0)
3245 90 : continue;
3246 :
3247 3 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
6675 tgl 3248 CBC 3 : tmpdirname, temp_de->d_name);
3249 :
1955 andres 3250 GIC 3 : if (unlink_all ||
3251 3 : strncmp(temp_de->d_name,
3252 : PG_TEMP_FILE_PREFIX,
3253 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
1955 andres 3254 CBC 3 : {
219 michael 3255 GNC 3 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
1955 andres 3256 ECB :
219 michael 3257 GNC 3 : if (type == PGFILETYPE_ERROR)
1955 andres 3258 LBC 0 : continue;
219 michael 3259 GNC 3 : else if (type == PGFILETYPE_DIR)
1955 andres 3260 ECB : {
3261 : /* recursively remove contents, then directory itself */
1918 tgl 3262 CBC 1 : RemovePgTempFilesInDir(rm_path, false, true);
1952 tgl 3263 ECB :
1952 tgl 3264 GIC 1 : if (rmdir(rm_path) < 0)
1952 tgl 3265 UIC 0 : ereport(LOG,
1952 tgl 3266 ECB : (errcode_for_file_access(),
3267 : errmsg("could not remove directory \"%s\": %m",
3268 : rm_path)));
1955 andres 3269 : }
1955 andres 3270 EUB : else
1952 tgl 3271 ECB : {
1952 tgl 3272 GIC 2 : if (unlink(rm_path) < 0)
1952 tgl 3273 UIC 0 : ereport(LOG,
1952 tgl 3274 ECB : (errcode_for_file_access(),
3275 : errmsg("could not remove file \"%s\": %m",
3276 : rm_path)));
1952 tgl 3277 EUB : }
3278 : }
3279 : else
1952 tgl 3280 UIC 0 : ereport(LOG,
3281 : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3282 : rm_path)));
3283 : }
6675 tgl 3284 ECB :
6675 tgl 3285 GBC 45 : FreeDir(temp_dir);
3286 : }
3287 :
3288 : /* Process one tablespace directory, look for per-DB subdirectories */
3289 : static void
4622 rhaas 3290 GIC 647 : RemovePgTempRelationFiles(const char *tsdirname)
3291 : {
4622 rhaas 3292 EUB : DIR *ts_dir;
3293 : struct dirent *de;
3294 : char dbspace_path[MAXPGPATH * 2];
3295 :
4622 rhaas 3296 GIC 647 : ts_dir = AllocateDir(tsdirname);
4622 rhaas 3297 ECB :
1952 tgl 3298 GIC 4011 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3299 : {
3300 : /*
3301 : * We're only interested in the per-database directories, which have
4622 rhaas 3302 ECB : * numeric names. Note that this code will also (properly) ignore "."
3303 : * and "..".
3304 : */
1952 tgl 3305 GIC 3364 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
4622 rhaas 3306 1338 : continue;
3307 :
4622 rhaas 3308 CBC 2026 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
4622 rhaas 3309 GIC 2026 : tsdirname, de->d_name);
4622 rhaas 3310 CBC 2026 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3311 : }
3312 :
4622 rhaas 3313 GIC 647 : FreeDir(ts_dir);
3314 647 : }
3315 :
3316 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
4622 rhaas 3317 ECB : static void
4622 rhaas 3318 CBC 2026 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3319 : {
4622 rhaas 3320 ECB : DIR *dbspace_dir;
3321 : struct dirent *de;
2189 peter_e 3322 : char rm_path[MAXPGPATH * 2];
3323 :
4622 rhaas 3324 GIC 2026 : dbspace_dir = AllocateDir(dbspacedirname);
4622 rhaas 3325 ECB :
1952 tgl 3326 CBC 607643 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3327 : {
4622 rhaas 3328 GIC 605617 : if (!looks_like_temp_rel_name(de->d_name))
3329 605608 : continue;
4622 rhaas 3330 ECB :
4622 rhaas 3331 GIC 9 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3332 9 : dbspacedirname, de->d_name);
3333 :
1952 tgl 3334 9 : if (unlink(rm_path) < 0)
1952 tgl 3335 UIC 0 : ereport(LOG,
1952 tgl 3336 ECB : (errcode_for_file_access(),
3337 : errmsg("could not remove file \"%s\": %m",
3338 : rm_path)));
3339 : }
4622 rhaas 3340 :
4622 rhaas 3341 CBC 2026 : FreeDir(dbspace_dir);
4622 rhaas 3342 GIC 2026 : }
4622 rhaas 3343 ECB :
/*
 * Does the name have the form of a temporary relation file name, that is
 * t<digits>_<digits>, optionally followed by _<forkname> and/or by
 * .<digits> (a segment number)?
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *start;

	/* Leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* Next comes at least one digit, terminated by an underscore. */
	start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == start || *cur != '_')
		return false;
	cur++;

	/* Then a second run of at least one digit. */
	start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == start)
		return false;

	/* Optional _forkname; forkname_chars reports how long the name is. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional .segment, which needs at least one digit after the dot. */
	if (*cur == '.')
	{
		const char *seg = cur + 1;

		while (isdigit((unsigned char) *seg))
			seg++;
		if (seg == cur + 1)
			return false;
		cur = seg;
	}

	/* Anything left over disqualifies the name. */
	return *cur == '\0';
}
2897 rhaas 3392 ECB :
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system that holds "path".
 *
 * Failure to open the path or to sync is reported at LOG level only; the
 * function returns normally either way.
 */
static void
do_syncfs(const char *path)
{
	int			pathfd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	pathfd = OpenTransientFile(path, O_RDONLY);
	if (pathfd >= 0)
	{
		if (syncfs(pathfd) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not synchronize file system for file \"%s\": %m", path)));
		CloseTransientFile(pathfd);
	}
	else
	{
		/* Could not even open it; log that and give up on this path */
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	}
}
#endif
2873 tgl 3417 :
3418 : /*
3419 : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
750 tmunro 3420 : * all potential filesystem, depending on recovery_init_sync_method setting.
3421 : *
2873 tgl 3422 : * We fsync regular files and directories wherever they are, but we
2362 rhaas 3423 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3424 : * Other symlinks are presumed to point at files we're not responsible
3425 : * for fsyncing, and might not have privileges to write at all.
2897 3426 : *
3427 : * Errors are logged but not considered fatal; that's because this is used
3428 : * only during database startup, to deal with the possibility that there are
3429 : * issued-but-unsynced writes pending against the data directory. We want to
3430 : * ensure that such writes reach disk before anything that's done in the new
3431 : * run. However, aborting on error would result in failure to start for
3432 : * harmless cases such as read-only files in the data directory, and that's
3433 : * not good either.
3434 : *
3435 : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3436 : * rewriting all changes again during recovery.
3437 : *
3438 : * Note we assume we're chdir'd into PGDATA to begin with.
3439 : */
3440 : void
2873 tgl 3441 GIC 131 : SyncDataDirectory(void)
3442 : {
3443 : bool xlog_is_symlink;
3444 :
3445 : /* We can skip this whole thing if fsync is disabled. */
3446 131 : if (!enableFsync)
3447 131 : return;
3448 :
3449 : /*
3450 : * If pg_wal is a symlink, we'll need to recurse into it separately,
3451 : * because the first walkdir below will ignore it.
3452 : */
2873 tgl 3453 LBC 0 : xlog_is_symlink = false;
3454 :
3455 : {
3456 : struct stat st;
2897 rhaas 3457 ECB :
2362 rhaas 3458 LBC 0 : if (lstat("pg_wal", &st) < 0)
2873 tgl 3459 UIC 0 : ereport(LOG,
3460 : (errcode_for_file_access(),
3461 : errmsg("could not stat file \"%s\": %m",
3462 : "pg_wal")));
3463 0 : else if (S_ISLNK(st.st_mode))
2873 tgl 3464 UBC 0 : xlog_is_symlink = true;
3465 : }
2873 tgl 3466 EUB :
3467 : #ifdef HAVE_SYNCFS
750 tmunro 3468 UIC 0 : if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
3469 : {
750 tmunro 3470 EUB : DIR *dir;
3471 : struct dirent *de;
3472 :
3473 : /*
3474 : * On Linux, we don't have to open every single file one by one. We
3475 : * can use syncfs() to sync whole filesystems. We only expect
3476 : * filesystem boundaries to exist where we tolerate symlinks, namely
3477 : * pg_wal and the tablespaces, so we call syncfs() for each of those
3478 : * directories.
3479 : */
3480 :
3481 : /* Prepare to report progress syncing the data directory via syncfs. */
531 rhaas 3482 UIC 0 : begin_startup_progress_phase();
3483 :
3484 : /* Sync the top level pgdata directory. */
750 tmunro 3485 0 : do_syncfs(".");
3486 : /* If any tablespaces are configured, sync each of those. */
3487 0 : dir = AllocateDir("pg_tblspc");
3488 0 : while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
750 tmunro 3489 EUB : {
3490 : char path[MAXPGPATH];
3491 :
750 tmunro 3492 UBC 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
750 tmunro 3493 UIC 0 : continue;
750 tmunro 3494 EUB :
750 tmunro 3495 UBC 0 : snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
750 tmunro 3496 UIC 0 : do_syncfs(path);
3497 : }
3498 0 : FreeDir(dir);
750 tmunro 3499 EUB : /* If pg_wal is a symlink, process that too. */
750 tmunro 3500 UBC 0 : if (xlog_is_symlink)
750 tmunro 3501 UIC 0 : do_syncfs("pg_wal");
750 tmunro 3502 UBC 0 : return;
750 tmunro 3503 EUB : }
3504 : #endif /* !HAVE_SYNCFS */
3505 :
3506 : #ifdef PG_FLUSH_DATA_WORKS
531 rhaas 3507 : /* Prepare to report progress of the pre-fsync phase. */
531 rhaas 3508 UBC 0 : begin_startup_progress_phase();
531 rhaas 3509 EUB :
3510 : /*
3511 : * If possible, hint to the kernel that we're soon going to fsync the data
3512 : * directory and its contents. Errors in this step are even less
3513 : * interesting than normal, so log them only at DEBUG1.
3514 : */
2873 tgl 3515 UBC 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
2873 tgl 3516 UIC 0 : if (xlog_is_symlink)
2362 rhaas 3517 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
2873 tgl 3518 0 : walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3519 : #endif
3520 :
3521 : /* Prepare to report progress syncing the data directory via fsync. */
531 rhaas 3522 UBC 0 : begin_startup_progress_phase();
531 rhaas 3523 EUB :
2873 tgl 3524 : /*
3525 : * Now we do the fsync()s in the same order.
3526 : *
3527 : * The main call ignores symlinks, so in addition to specially processing
3528 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3529 : * process_symlinks = true. Note that if there are any plain directories
3530 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3531 : * so we don't worry about optimizing it.
3532 : */
2587 andres 3533 UIC 0 : walkdir(".", datadir_fsync_fname, false, LOG);
2873 tgl 3534 0 : if (xlog_is_symlink)
2362 rhaas 3535 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
2587 andres 3536 0 : walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3537 : }
3538 :
3539 : /*
2897 rhaas 3540 EUB : * walkdir: recursively walk a directory, applying the action to each
2873 tgl 3541 : * regular file and directory (including the named directory itself).
3542 : *
3543 : * If process_symlinks is true, the action and recursion are also applied
3544 : * to regular files and directories that are pointed to by symlinks in the
3545 : * given directory; otherwise symlinks are ignored. Symlinks are always
3546 : * ignored in subdirectories, ie we intentionally don't pass down the
3547 : * process_symlinks flag to recursive calls.
3548 : *
3549 : * Errors are reported at level elevel, which might be ERROR or less.
3550 : *
3551 : * See also walkdir in file_utils.c, which is a frontend version of this
3552 : * logic.
3553 : */
3554 : static void
2873 tgl 3555 GIC 162 : walkdir(const char *path,
3556 : void (*action) (const char *fname, bool isdir, int elevel),
3557 : bool process_symlinks,
3558 : int elevel)
3559 : {
3560 : DIR *dir;
3561 : struct dirent *de;
2897 rhaas 3562 ECB :
2897 rhaas 3563 GIC 162 : dir = AllocateDir(path);
3564 :
2873 tgl 3565 1886 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3566 : {
3567 : char subpath[MAXPGPATH * 2];
3568 :
2897 rhaas 3569 1724 : CHECK_FOR_INTERRUPTS();
2897 rhaas 3570 ECB :
2897 rhaas 3571 GIC 1724 : if (strcmp(de->d_name, ".") == 0 ||
2897 rhaas 3572 CBC 1562 : strcmp(de->d_name, "..") == 0)
2897 rhaas 3573 GIC 324 : continue;
3574 :
2189 peter_e 3575 1400 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
2897 rhaas 3576 ECB :
944 tmunro 3577 GIC 1400 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
2873 tgl 3578 ECB : {
944 tmunro 3579 CBC 1400 : case PGFILETYPE_REG:
3580 1400 : (*action) (subpath, false, elevel);
944 tmunro 3581 GIC 1400 : break;
944 tmunro 3582 LBC 0 : case PGFILETYPE_DIR:
944 tmunro 3583 UIC 0 : walkdir(subpath, action, false, elevel);
944 tmunro 3584 LBC 0 : break;
944 tmunro 3585 UIC 0 : default:
2897 rhaas 3586 ECB :
944 tmunro 3587 : /*
3588 : * Errors are already reported directly by get_dirent_type(),
944 tmunro 3589 EUB : * and any remaining symlinks and unknown file types are
3590 : * ignored.
3591 : */
944 tmunro 3592 UBC 0 : break;
3593 : }
3594 : }
3595 :
2873 tgl 3596 GIC 162 : FreeDir(dir); /* we ignore any error here */
3597 :
3598 : /*
2873 tgl 3599 EUB : * It's important to fsync the destination directory itself as individual
3600 : * file fsyncs don't guarantee that the directory entry for the file is
3601 : * synced. However, skip this if AllocateDir failed; the action function
3602 : * might not be robust against that.
2873 tgl 3603 ECB : */
1952 tgl 3604 GIC 162 : if (dir)
3605 162 : (*action) (path, true, elevel);
2873 3606 162 : }
3607 :
3608 :
/*
 * pre_sync_fname -- hint to the OS that it should get ready to fsync()
 * this file.
 *
 * Directories are skipped.  Failure to open a file because of EACCES is
 * silently ignored; other open or close failures are reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			hint_fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	hint_fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (hint_fd < 0)
	{
		/* Unreadable files are not worth a message. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(hint_fd, 0, 0);

	if (CloseTransientFile(hint_fd) != 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
	}
}

#endif							/* PG_FLUSH_DATA_WORKS */
3654 :
/*
 * datadir_fsync_fname -- walkdir action that reports startup progress for
 * "fname" and then fsyncs it via fsync_fname_ext().
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.  Its result
	 * is not used here.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3667 :
/*
 * unlink_if_exists_fname -- walkdir action that removes "fname".
 *
 * Directories are removed with rmdir(), and a directory that is already
 * gone (ENOENT) is not reported.  Anything else is handed to
 * PathNameDeleteTemporaryFile().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	if (rmdir(fname) != 0 && errno != ENOENT)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
	}
}
3684 :
3685 : /*
3686 : * fsync_fname_ext -- Try to fsync a file or directory
3687 : *
2587 andres 3688 ECB : * If ignore_perm is true, ignore errors upon trying to open unreadable
3689 : * files. Logs other errors at a caller-specified level.
3690 : *
3691 : * Returns 0 if the operation succeeded, -1 otherwise.
3692 : */
3693 : int
2587 andres 3694 GIC 34108 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3695 : {
3696 : int fd;
3697 : int flags;
3698 : int returncode;
3699 :
3700 : /*
2873 tgl 3701 ECB : * Some OSs require directories to be opened read-only whereas other
3702 : * systems don't allow us to fsync files opened read-only; so we need both
3703 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3704 : * not writable by our userid, but we assume that's OK.
3705 : */
2873 tgl 3706 GIC 34108 : flags = PG_BINARY;
3707 34108 : if (!isdir)
3708 11203 : flags |= O_RDWR;
3709 : else
3710 22905 : flags |= O_RDONLY;
3711 :
2024 peter_e 3712 34108 : fd = OpenTransientFile(fname, flags);
2587 andres 3713 ECB :
2873 tgl 3714 : /*
2587 andres 3715 : * Some OSs don't allow us to open directories at all (Windows returns
3716 : * EACCES), just ignore the error in that case. If desired also silently
3717 : * ignoring errors about unreadable files. Log others.
3718 : */
2587 andres 3719 CBC 34108 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
2587 andres 3720 UIC 0 : return 0;
2587 andres 3721 GIC 34108 : else if (fd < 0 && ignore_perm && errno == EACCES)
2587 andres 3722 UIC 0 : return 0;
2587 andres 3723 GIC 34108 : else if (fd < 0)
3724 : {
2873 tgl 3725 UIC 0 : ereport(elevel,
2873 tgl 3726 ECB : (errcode_for_file_access(),
2873 tgl 3727 EUB : errmsg("could not open file \"%s\": %m", fname)));
2587 andres 3728 LBC 0 : return -1;
2897 rhaas 3729 EUB : }
2897 rhaas 3730 ECB :
2873 tgl 3731 GIC 34108 : returncode = pg_fsync(fd);
2873 tgl 3732 EUB :
3733 : /*
3734 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3735 : * those errors. Anything else needs to be logged.
3736 : */
1505 tmunro 3737 GIC 34108 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
2587 andres 3738 ECB : {
3739 : int save_errno;
3740 :
3741 : /* close file upon error, might not be in transaction context */
2587 andres 3742 UIC 0 : save_errno = errno;
3743 0 : (void) CloseTransientFile(fd);
2587 andres 3744 LBC 0 : errno = save_errno;
3745 :
2873 tgl 3746 UIC 0 : ereport(elevel,
3747 : (errcode_for_file_access(),
3748 : errmsg("could not fsync file \"%s\": %m", fname)));
2587 andres 3749 UBC 0 : return -1;
2587 andres 3750 EUB : }
2873 tgl 3751 :
1373 peter 3752 GIC 34108 : if (CloseTransientFile(fd) != 0)
1492 michael 3753 EUB : {
1492 michael 3754 UIC 0 : ereport(elevel,
3755 : (errcode_for_file_access(),
1492 michael 3756 EUB : errmsg("could not close file \"%s\": %m", fname)));
1492 michael 3757 UIC 0 : return -1;
3758 : }
2587 andres 3759 ECB :
2587 andres 3760 GIC 34108 : return 0;
2587 andres 3761 EUB : }
3762 :
3763 : /*
3764 : * fsync_parent_path -- fsync the parent path of a file or directory
3765 : *
3766 : * This is aimed at making file operations persistent on disk in case of
2587 andres 3767 ECB : * an OS crash or power failure.
3768 : */
3769 : static int
2587 andres 3770 GIC 4809 : fsync_parent_path(const char *fname, int elevel)
3771 : {
3772 : char parentpath[MAXPGPATH];
3773 :
3774 4809 : strlcpy(parentpath, fname, MAXPGPATH);
3775 4809 : get_parent_directory(parentpath);
3776 :
2587 andres 3777 ECB : /*
3778 : * get_parent_directory() returns an empty string if the input argument is
3779 : * just a file name (see comments in path.c), so handle that as being the
3780 : * current directory.
3781 : */
2587 andres 3782 CBC 4809 : if (strlen(parentpath) == 0)
2587 andres 3783 GIC 151 : strlcpy(parentpath, ".", MAXPGPATH);
3784 :
3785 4809 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
2587 andres 3786 UIC 0 : return -1;
3787 :
2587 andres 3788 GIC 4809 : return 0;
2897 rhaas 3789 ECB : }
1828 sfrost 3790 :
3791 : /*
3792 : * Create a PostgreSQL data sub-directory
1828 sfrost 3793 EUB : *
3794 : * The data directory itself, and most of its sub-directories, are created at
1782 tgl 3795 ECB : * initdb time, but we do have some occasions when we create directories in
3796 : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3797 : * make sure that those directories are created consistently. Today, that means
3798 : * making sure that the created directory has the correct permissions, which is
3799 : * what pg_dir_create_mode tracks for us.
3800 : *
3801 : * Note that we also set the umask() based on what we understand the correct
3802 : * permissions to be (see file_perm.c).
3803 : *
3804 : * For permissions other than the default, mkdir() can be used directly, but
3805 : * be sure to consider carefully such cases -- a sub-directory with incorrect
3806 : * permissions in a PostgreSQL data directory could cause backups and other
3807 : * processes to fail.
3808 : */
3809 : int
1828 sfrost 3810 GIC 1638 : MakePGDirectory(const char *directoryName)
3811 : {
3812 1638 : return mkdir(directoryName, pg_dir_create_mode);
3813 : }
3814 :
3815 : /*
3816 : * Return the passed-in error level, or PANIC if data_sync_retry is off.
1602 tmunro 3817 ECB : *
3818 : * Failure to fsync any data file is cause for immediate panic, unless
3819 : * data_sync_retry is enabled. Data may have been written to the operating
3820 : * system and removed from our buffer pool already, and if we are running on
3821 : * an operating system that forgets dirty data on write-back failure, there
3822 : * may be only one copy of the data remaining: in the WAL. A later attempt to
3823 : * fsync again might falsely report success. Therefore we must not allow any
3824 : * further checkpoints to be attempted. data_sync_retry can in theory be
3825 : * enabled on systems known not to drop dirty buffered data on write-back
3826 : * failure (with the likely outcome that checkpoints will continue to fail
3827 : * until the underlying problem is fixed).
3828 : *
3829 : * Any code that reports a failure from fsync() or related functions should
3830 : * filter the error level with this function.
3831 : */
3832 : int
1602 tmunro 3833 GIC 19861 : data_sync_elevel(int elevel)
3834 : {
3835 19861 : return data_sync_retry ? elevel : PANIC;
3836 : }
3837 :
3838 : bool
1 tmunro 3839 GNC 1859 : check_io_direct(char **newval, void **extra, GucSource source)
3840 : {
3841 1859 : bool result = true;
3842 : int flags;
1 tmunro 3843 ECB :
3844 : #if PG_O_DIRECT == 0
3845 : if (strcmp(*newval, "") != 0)
3846 : {
3847 : GUC_check_errdetail("io_direct is not supported on this platform.");
3848 : result = false;
3849 : }
3850 : flags = 0;
3851 : #else
3852 : List *elemlist;
3853 : ListCell *l;
3854 : char *rawstring;
3855 :
3856 : /* Need a modifiable copy of string */
1 tmunro 3857 GNC 1859 : rawstring = pstrdup(*newval);
3858 :
3859 1859 : if (!SplitGUCList(rawstring, ',', &elemlist))
3860 : {
1 tmunro 3861 UNC 0 : GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
3862 : "io_direct");
3863 0 : pfree(rawstring);
3864 0 : list_free(elemlist);
3865 0 : return false;
3866 : }
3867 :
1 tmunro 3868 GNC 1859 : flags = 0;
3869 1865 : foreach(l, elemlist)
3870 : {
3871 6 : char *item = (char *) lfirst(l);
3872 :
3873 6 : if (pg_strcasecmp(item, "data") == 0)
3874 2 : flags |= IO_DIRECT_DATA;
3875 4 : else if (pg_strcasecmp(item, "wal") == 0)
3876 2 : flags |= IO_DIRECT_WAL;
3877 2 : else if (pg_strcasecmp(item, "wal_init") == 0)
3878 2 : flags |= IO_DIRECT_WAL_INIT;
3879 : else
3880 : {
1 tmunro 3881 UNC 0 : GUC_check_errdetail("invalid option \"%s\"", item);
3882 0 : result = false;
1 tmunro 3883 LBC 0 : break;
1 tmunro 3884 ECB : }
3885 : }
3886 :
3887 : /*
3888 : * It's possible to configure block sizes smaller than our assumed I/O
3889 : * alignment size, which could result in invalid I/O requests.
3890 : */
3891 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
3892 : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
3893 : {
3894 : GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
3895 : result = false;
3896 : }
3897 : #endif
3898 : #if BLCKSZ < PG_IO_ALIGN_SIZE
3899 : if (result && (flags & IO_DIRECT_DATA))
3900 : {
3901 : GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small");
3902 : result = false;
3903 : }
3904 : #endif
3905 :
1 tmunro 3906 GNC 1859 : pfree(rawstring);
3907 1859 : list_free(elemlist);
3908 : #endif
3909 :
3910 1859 : if (!result)
1 tmunro 3911 UNC 0 : return result;
3912 :
3913 : /* Save the flags in *extra, for use by assign_io_direct */
1 tmunro 3914 GNC 1859 : *extra = guc_malloc(ERROR, sizeof(int));
3915 1859 : *((int *) *extra) = flags;
3916 :
3917 1859 : return result;
3918 : }
3919 :
3920 : extern void
3921 1859 : assign_io_direct(const char *newval, void *extra)
3922 : {
3923 1859 : int *flags = (int *) extra;
3924 :
3925 1859 : io_direct_flags = *flags;
1 tmunro 3926 GIC 1859 : }
|