LCOV - differential code coverage report
Current view: top level - src/backend/storage/file - fd.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB DUB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 72.7 % 971 706 18 41 174 32 30 494 50 132 193 500 10 33
Current Date: 2023-04-08 15:15:32 Functions: 90.2 % 92 83 1 8 76 7 8 78 5
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * fd.c
       4                 :  *    Virtual file descriptor code.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  * IDENTIFICATION
      10                 :  *    src/backend/storage/file/fd.c
      11                 :  *
      12                 :  * NOTES:
      13                 :  *
      14                 :  * This code manages a cache of 'virtual' file descriptors (VFDs).
      15                 :  * The server opens many file descriptors for a variety of reasons,
      16                 :  * including base tables, scratch files (e.g., sort and hash spool
      17                 :  * files), and random calls to C library routines like system(3); it
      18                 :  * is quite easy to exceed system limits on the number of open files a
      19                 :  * single process can have.  (This is around 1024 on many modern
      20                 :  * operating systems, but may be lower on others.)
      21                 :  *
      22                 :  * VFDs are managed as an LRU pool, with actual OS file descriptors
      23                 :  * being opened and closed as needed.  Obviously, if a routine is
      24                 :  * opened using these interfaces, all subsequent operations must also
      25                 :  * be through these interfaces (the File type is not a real file
      26                 :  * descriptor).
      27                 :  *
      28                 :  * For this scheme to work, most (if not all) routines throughout the
      29                 :  * server should use these interfaces instead of calling the C library
      30                 :  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
      31                 :  * may find ourselves short of real file descriptors anyway.
      32                 :  *
      33                 :  * INTERFACE ROUTINES
      34                 :  *
      35                 :  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
      36                 :  * A File opened with OpenTemporaryFile is automatically deleted when the
      37                 :  * File is closed, either explicitly or implicitly at end of transaction or
      38                 :  * process exit. PathNameOpenFile is intended for files that are held open
      39                 :  * for a long time, like relation files. It is the caller's responsibility
      40                 :  * to close them; there is no automatic mechanism in fd.c for that.
      41                 :  *
      42                 :  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
      43                 :  * temporary files that have names so that they can be shared between
      44                 :  * backends.  Such files are automatically closed and count against the
      45                 :  * temporary file limit of the backend that creates them, but unlike anonymous
      46                 :  * files they are not automatically deleted.  See sharedfileset.c for a shared
      47                 :  * ownership mechanism that provides automatic cleanup for shared files when
      48                 :  * the last of a group of backends detaches.
      49                 :  *
      50                 :  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
      51                 :  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
      52                 :  * They behave like the corresponding native functions, except that the handle
      53                 :  * is registered with the current subtransaction, and will be automatically
      54                 :  * closed at abort. These are intended mainly for short operations like
      55                 :  * reading a configuration file; there is a limit on the number of files that
      56                 :  * can be opened using these functions at any one time.
      57                 :  *
      58                 :  * Finally, BasicOpenFile is just a thin wrapper around open() that can
      59                 :  * release file descriptors in use by the virtual file descriptors if
      60                 :  * necessary. There is no automatic cleanup of file descriptors returned by
      61                 :  * BasicOpenFile; it is solely the caller's responsibility to close the file
      62                 :  * descriptor by calling close(2).
      63                 :  *
      64                 :  * If a non-virtual file descriptor needs to be held open for any length of
      65                 :  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
      66                 :  * (and eventually ReleaseExternalFD), so that we can take it into account
      67                 :  * while deciding how many VFDs can be open.  This applies to FDs obtained
      68                 :  * with BasicOpenFile as well as those obtained without use of any fd.c API.
      69                 :  *
      70                 :  *-------------------------------------------------------------------------
      71                 :  */
      72                 : 
      73                 : #include "postgres.h"
      74                 : 
      75                 : #include <dirent.h>
      76                 : #include <sys/file.h>
      77                 : #include <sys/param.h>
      78                 : #include <sys/resource.h>     /* for getrlimit */
      79                 : #include <sys/stat.h>
      80                 : #include <sys/types.h>
      81                 : #ifndef WIN32
      82                 : #include <sys/mman.h>
      83                 : #endif
      84                 : #include <limits.h>
      85                 : #include <unistd.h>
      86                 : #include <fcntl.h>
      87                 : 
      88                 : #include "access/xact.h"
      89                 : #include "access/xlog.h"
      90                 : #include "catalog/pg_tablespace.h"
      91                 : #include "common/file_perm.h"
      92                 : #include "common/file_utils.h"
      93                 : #include "common/pg_prng.h"
      94                 : #include "miscadmin.h"
      95                 : #include "pgstat.h"
      96                 : #include "portability/mem.h"
      97                 : #include "postmaster/startup.h"
      98                 : #include "storage/fd.h"
      99                 : #include "storage/ipc.h"
     100                 : #include "utils/guc.h"
     101                 : #include "utils/guc_hooks.h"
     102                 : #include "utils/resowner_private.h"
     103                 : #include "utils/varlena.h"
     104                 : 
     105                 : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
     106                 : #if defined(HAVE_SYNC_FILE_RANGE)
     107                 : #define PG_FLUSH_DATA_WORKS 1
     108                 : #elif !defined(WIN32) && defined(MS_ASYNC)
     109                 : #define PG_FLUSH_DATA_WORKS 1
     110                 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     111                 : #define PG_FLUSH_DATA_WORKS 1
     112                 : #endif
     113                 : 
     114                 : /*
     115                 :  * We must leave some file descriptors free for system(), the dynamic loader,
     116                 :  * and other code that tries to open files without consulting fd.c.  This
     117                 :  * is the number left free.  (While we try fairly hard to prevent EMFILE
     118                 :  * errors, there's never any guarantee that we won't get ENFILE due to
     119                 :  * other processes chewing up FDs.  So it's a bad idea to try to open files
     120                 :  * without consulting fd.c.  Nonetheless we cannot control all code.)
     121                 :  *
     122                 :  * Because this is just a fixed setting, we are effectively assuming that
     123                 :  * no such code will leave FDs open over the long term; otherwise the slop
     124                 :  * is likely to be insufficient.  Note in particular that we expect that
     125                 :  * loading a shared library does not result in any permanent increase in
     126                 :  * the number of open files.  (This appears to be true on most if not
     127                 :  * all platforms as of Feb 2004.)
     128                 :  */
     129                 : #define NUM_RESERVED_FDS        10
     130                 : 
     131                 : /*
     132                 :  * If we have fewer than this many usable FDs after allowing for the reserved
     133                 :  * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
     134                 :  * much less than that.  Note that this value ensures numExternalFDs can be
     135                 :  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
     136                 :  * will not pass unless that can grow to at least 14.)
     137                 :  */
     138                 : #define FD_MINFREE              48
     139                 : 
     140                 : /*
     141                 :  * A number of platforms allow individual processes to open many more files
     142                 :  * than they can really support when *many* processes do the same thing.
     143                 :  * This GUC parameter lets the DBA limit max_safe_fds to something less than
     144                 :  * what the postmaster's initial probe suggests will work.
     145                 :  */
     146                 : int         max_files_per_process = 1000;
     147                 : 
     148                 : /*
     149                 :  * Maximum number of file descriptors to open for operations that fd.c knows
     150                 :  * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
     151                 :  * to a conservative value, and remains that way indefinitely in bootstrap or
     152                 :  * standalone-backend cases.  In normal postmaster operation, the postmaster
     153                 :  * calls set_max_safe_fds() late in initialization to update the value, and
     154                 :  * that value is then inherited by forked subprocesses.
     155                 :  *
     156                 :  * Note: the value of max_files_per_process is taken into account while
     157                 :  * setting this variable, and so need not be tested separately.
     158                 :  */
     159                 : int         max_safe_fds = FD_MINFREE;  /* default if not changed */
     160                 : 
     161                 : /* Whether it is safe to continue running after fsync() fails. */
     162                 : bool        data_sync_retry = false;
     163                 : 
     164                 : /* How SyncDataDirectory() should do its job. */
     165                 : int         recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
     166                 : 
     167                 : /* Which kinds of files should be opened with PG_O_DIRECT. */
     168                 : int         io_direct_flags;
     169                 : 
     170                 : /* Debugging.... */
     171                 : 
     172                 : #ifdef FDDEBUG
     173                 : #define DO_DB(A) \
     174                 :     do { \
     175                 :         int         _do_db_save_errno = errno; \
     176                 :         A; \
     177                 :         errno = _do_db_save_errno; \
     178                 :     } while (0)
     179                 : #else
     180                 : #define DO_DB(A) \
     181                 :     ((void) 0)
     182                 : #endif
     183                 : 
     184                 : #define VFD_CLOSED (-1)
     185                 : 
     186                 : #define FileIsValid(file) \
     187                 :     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
     188                 : 
     189                 : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
     190                 : 
     191                 : /* these are the assigned bits in fdstate below: */
     192                 : #define FD_DELETE_AT_CLOSE  (1 << 0)  /* T = delete when closed */
     193                 : #define FD_CLOSE_AT_EOXACT  (1 << 1)  /* T = close at eoXact */
     194                 : #define FD_TEMP_FILE_LIMIT  (1 << 2)  /* T = respect temp_file_limit */
     195                 : 
     196                 : typedef struct vfd
     197                 : {
     198                 :     int         fd;             /* current FD, or VFD_CLOSED if none */
     199                 :     unsigned short fdstate;     /* bitflags for VFD's state: FD_DELETE_AT_CLOSE, FD_CLOSE_AT_EOXACT, FD_TEMP_FILE_LIMIT */
     200                 :     ResourceOwner resowner;     /* owner, for automatic cleanup */
     201                 :     File        nextFree;       /* link to next free VFD, if in freelist */
     202                 :     File        lruMoreRecently;    /* doubly linked recency-of-use list */
     203                 :     File        lruLessRecently;    /* opposite-direction link of the same list */
     204                 :     off_t       fileSize;       /* current size of file (0 if not temporary) */
     205                 :     char       *fileName;       /* name of file, or NULL for unused VFD */
     206                 :     /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
     207                 :     int         fileFlags;      /* open(2) flags for (re)opening the file */
     208                 :     mode_t      fileMode;       /* mode to pass to open(2) */
     209                 : } Vfd;
     210                 : 
     211                 : /*
     212                 :  * Virtual File Descriptor array pointer and size.  This grows as
     213                 :  * needed.  'File' values are indexes into this array.
     214                 :  * Note that VfdCache[0] is not a usable VFD, just a list header.
     215                 :  */
     216                 : static Vfd *VfdCache;
     217                 : static Size SizeVfdCache = 0;
     218                 : 
     219                 : /*
     220                 :  * Number of file descriptors known to be in use by VFD entries.
     221                 :  */
     222                 : static int  nfile = 0;
     223                 : 
     224                 : /*
     225                 :  * Flag to tell whether it's worth scanning VfdCache looking for temp files
     226                 :  * to close
     227                 :  */
     228                 : static bool have_xact_temporary_files = false;
     229                 : 
     230                 : /*
     231                 :  * Tracks the total size of all temporary files.  Note: when temp_file_limit
     232                 :  * is being enforced, this cannot overflow since the limit cannot be more
     233                 :  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
     234                 :  * overflow, but we don't care.
     235                 :  */
     236                 : static uint64 temporary_files_size = 0;
     237                 : 
     238                 : /* Temporary file access initialized and not yet shut down? */
     239                 : #ifdef USE_ASSERT_CHECKING
     240                 : static bool temporary_files_allowed = false;
     241                 : #endif
     242                 : 
     243                 : /*
     244                 :  * List of OS handles opened with AllocateFile, OpenPipeStream, AllocateDir
     245                 :  * and OpenTransientFile.
     246                 :  */
     247                 : typedef enum
     248                 : {
     249                 :     AllocateDescFile,           /* presumably a stream from AllocateFile -- confirm at use site */
     250                 :     AllocateDescPipe,           /* presumably a stream from OpenPipeStream -- confirm at use site */
     251                 :     AllocateDescDir,            /* presumably a DIR from AllocateDir -- confirm at use site */
     252                 :     AllocateDescRawFD           /* presumably a raw fd from OpenTransientFile -- confirm at use site */
     253                 : } AllocateDescKind;
     254                 : 
     255                 : typedef struct
     256                 : {
     257                 :     AllocateDescKind kind;      /* which kind of handle this entry holds */
     258                 :     SubTransactionId create_subid;  /* NOTE(review): looks like the creating subtransaction -- confirm */
     259                 :     union
     260                 :     {
     261                 :         FILE       *file;
     262                 :         DIR        *dir;
     263                 :         int         fd;
     264                 :     }           desc;           /* the OS handle itself; member in use presumably follows kind */
     265                 : } AllocateDesc;
     266                 : 
     267                 : static int  numAllocatedDescs = 0;
     268                 : static int  maxAllocatedDescs = 0;
     269                 : static AllocateDesc *allocatedDescs = NULL;
     270                 : 
     271                 : /*
     272                 :  * Number of open "external" FDs reported via Acquire/Reserve/ReleaseExternalFD.
     273                 :  */
     274                 : static int  numExternalFDs = 0;
     275                 : 
     276                 : /*
     277                 :  * Number of temporary files opened during the current session;
     278                 :  * this is used in generation of tempfile names.
     279                 :  */
     280                 : static long tempFileCounter = 0;
     281                 : 
     282                 : /*
     283                 :  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
     284                 :  * indicating that the current database's default tablespace should be used.)
     285                 :  * When numTempTableSpaces is -1, this has not been set in the current
     286                 :  * transaction.
     287                 :  */
     288                 : static Oid *tempTableSpaces = NULL;
     289                 : static int  numTempTableSpaces = -1;
     290                 : static int  nextTempTableSpace = 0;
     291                 : 
     292                 : 
     293                 : /*--------------------
     294                 :  *
     295                 :  * Private Routines
     296                 :  *
     297                 :  * Delete          - delete a file from the Lru ring
     298                 :  * LruDelete       - remove a file from the Lru ring and close its FD
     299                 :  * Insert          - put a file at the front of the Lru ring
     300                 :  * LruInsert       - put a file at the front of the Lru ring and open it
     301                 :  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
     302                 :  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
     303                 :  * AllocateVfd     - grab a free (or new) file record (from VfdCache)
     304                 :  * FreeVfd         - free a file record
     305                 :  *
     306                 :  * The Least Recently Used ring is a doubly linked list that begins and
     307                 :  * ends on element zero.  Element zero is special -- it doesn't represent
     308                 :  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
     309                 :  * anchor that shows us the beginning/end of the ring.
     310                 :  * Only VFD elements that are currently really open (have an FD assigned) are
     311                 :  * in the Lru ring.  Elements that are "virtually" open can be recognized
     312                 :  * by having a non-null fileName field.
     313                 :  *
     314                 :  * example:
     315                 :  *
     316                 :  *     /--less----\                /---------\
     317                 :  *     v           \              v           \
     318                 :  *   #0 --more---> LeastRecentlyUsed --more-\ \
     319                 :  *    ^\                                    | |
     320                 :  *     \\less--> MostRecentlyUsedFile    <---/ |
     321                 :  *      \more---/                    \--less--/
     322                 :  *
     323                 :  *--------------------
     324                 :  */
     325                 : static void Delete(File file);
     326                 : static void LruDelete(File file);
     327                 : static void Insert(File file);
     328                 : static int  LruInsert(File file);
     329                 : static bool ReleaseLruFile(void);
     330                 : static void ReleaseLruFiles(void);
     331                 : static File AllocateVfd(void);
     332                 : static void FreeVfd(File file);
     333                 : 
     334                 : static int  FileAccess(File file);
     335                 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
     336                 : static bool reserveAllocatedDesc(void);
     337                 : static int  FreeDesc(AllocateDesc *desc);
     338                 : 
     339                 : static void BeforeShmemExit_Files(int code, Datum arg);
     340                 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
     341                 : static void RemovePgTempRelationFiles(const char *tsdirname);
     342                 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
     343                 : 
     344                 : static void walkdir(const char *path,
     345                 :                     void (*action) (const char *fname, bool isdir, int elevel),
     346                 :                     bool process_symlinks,
     347                 :                     int elevel);
     348                 : #ifdef PG_FLUSH_DATA_WORKS
     349                 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
     350                 : #endif
     351                 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
     352                 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
     353                 : 
     354                 : static int  fsync_parent_path(const char *fname, int elevel);
     355                 : 
     356                 : 
     357                 : /*
     358                 :  * pg_fsync --- do fsync with or without writethrough, as selected by sync_method
     359                 :  */
     360                 : int
     361 GIC      117698 : pg_fsync(int fd)
     362                 : {
     363 ECB             : #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
     364                 :     struct stat st;
     365                 : 
     366                 :     /*
     367                 :      * Some operating system implementations of fsync() have requirements
     368                 :      * about the file access modes that were used when their file descriptor
     369                 :      * argument was opened, and these requirements differ depending on whether
     370                 :      * the file descriptor is for a directory.
     371                 :      *
     372                 :      * For any file descriptor that may eventually be handed to fsync(), we
     373                 :      * should have opened it with access modes that are compatible with
     374                 :      * fsync() on all supported systems, otherwise the code may not be
     375                 :      * portable, even if it runs ok on the current system.
     376                 :      *
     377                 :      * We assert here that a descriptor for a file was opened with write
     378                 :      * permissions (either O_RDWR or O_WRONLY) and for a directory without
     379                 :      * write permissions (O_RDONLY).
     380                 :      *
     381                 :      * Ignore any fstat errors and let the follow-up fsync() do its work.
     382                 :      * Doing this sanity check here means it is still performed in the case
     383                 :      * where fsync() is disabled.
     384                 :      */
     385 GIC      117698 :     if (fstat(fd, &st) == 0)
     386                 :     {
     387 CBC      117698 :         int         desc_flags = fcntl(fd, F_GETFL);
     388                 : 
     389 ECB             :         /*
     390                 :          * O_RDONLY is historically 0, so just make sure that for directories
     391                 :          * no write flags are used.
     392                 :          */
     393 GIC      117698 :         if (S_ISDIR(st.st_mode))
     394           22905 :             Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
     395 ECB             :         else
     396 CBC       94793 :             Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
     397                 :     }
     398          117698 :     errno = 0;                  /* reset errno possibly set by fstat()/fcntl() above */
     399                 : #endif
     400 ECB             : 
     401                 :     /* #if is to skip the sync_method test if there's no need for it */
     402                 : #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
     403                 :     if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
     404                 :         return pg_fsync_writethrough(fd);
     405                 :     else
     406                 : #endif
     407 GIC      117698 :         return pg_fsync_no_writethrough(fd);
     408                 : }
     409 ECB             : 
     410                 : 
     411                 : /*
     412                 :  * pg_fsync_no_writethrough --- same as fsync except does nothing if
     413                 :  *  enableFsync is off (it then just returns 0)
     414                 :  */
     415                 : int
     416 GIC      117698 : pg_fsync_no_writethrough(int fd)
     417                 : {
     418 CBC      117698 :     if (enableFsync)
     419 UIC           0 :         return fsync(fd);
     420 ECB             :     else
     421 GBC      117698 :         return 0;
     422                 : }
     423 ECB             : 
     424                 : /*
     425                 :  * pg_fsync_writethrough --- uses _commit() on WIN32, else fcntl(F_FULLFSYNC) where defined, else fails with errno = ENOSYS; returns 0 without syncing if enableFsync is off
     426                 :  */
     427                 : int
     428 UIC           0 : pg_fsync_writethrough(int fd)
     429                 : {
     430 UBC           0 :     if (enableFsync)
     431                 :     {
     432 EUB             : #ifdef WIN32
     433                 :         return _commit(fd);
     434                 : #elif defined(F_FULLFSYNC)
     435                 :         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
     436                 : #else
     437 UIC           0 :         errno = ENOSYS;         /* no write-through method compiled in */
     438               0 :         return -1;
     439 EUB             : #endif
     440                 :     }
     441                 :     else
     442 UIC           0 :         return 0;
     443                 : }
     444 EUB             : 
     445                 : /*
     446                 :  * pg_fdatasync --- same as fdatasync except does nothing (returns 0) if enableFsync is off
     447                 :  */
     448                 : int
     449 UBC           0 : pg_fdatasync(int fd)
     450                 : {
     451               0 :     if (enableFsync)
     452               0 :         return fdatasync(fd);
     453                 :     else
     454 UIC           0 :         return 0;               /* syncing disabled: report success without calling fdatasync */
     455                 : }
     456                 : 
     457 ECB             : /*
     458                 :  * pg_flush_data --- advise OS that the described dirty data should be flushed
     459                 :  *
     460                 :  * offset of 0 with nbytes 0 means that the entire file should be flushed
     461                 :  */
     462                 : void
     463 GIC      287494 : pg_flush_data(int fd, off_t offset, off_t nbytes)
     464                 : {
     465 ECB             :     /*
     466                 :      * Right now file flushing is primarily used to make later
     467                 :      * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     468                 :      * if fsyncs are disabled - that's a decision we might want to make
     469                 :      * configurable at some point.
     470                 :      */
     471 GIC      287494 :     if (!enableFsync)
     472          287494 :         return;
     473                 : 
     474                 :     /*
     475                 :      * We compile all alternatives that are supported on the current platform,
     476                 :      * to find portability problems more easily.
     477 EUB             :      */
     478                 : #if defined(HAVE_SYNC_FILE_RANGE)
     479                 :     {
     480                 :         int         rc;
     481                 :         static bool not_implemented_by_kernel = false;
     482                 : 
     483 UIC           0 :         if (not_implemented_by_kernel)
     484               0 :             return;
     485                 : 
     486                 :         /*
     487                 :          * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
     488                 :          * tells the OS that writeback for the specified blocks should be
     489 EUB             :          * started, but that we don't want to wait for completion.  Note that
     490                 :          * this call might block if too much dirty data exists in the range.
     491                 :          * This is the preferable method on OSs supporting it, as it works
     492                 :          * reliably when available (contrast to msync()) and doesn't flush out
     493                 :          * clean data (like FADV_DONTNEED).
     494                 :          */
     495 UIC           0 :         rc = sync_file_range(fd, offset, nbytes,
     496                 :                              SYNC_FILE_RANGE_WRITE);
     497               0 :         if (rc != 0)
     498                 :         {
     499                 :             int         elevel;
     500 EUB             : 
     501                 :             /*
     502                 :              * For systems that don't have an implementation of
     503                 :              * sync_file_range() such as Windows WSL, generate only one
     504                 :              * warning and then suppress all further attempts by this process.
     505                 :              */
     506 UBC           0 :             if (errno == ENOSYS)
     507                 :             {
     508               0 :                 elevel = WARNING;
     509 UIC           0 :                 not_implemented_by_kernel = true;
     510                 :             }
     511                 :             else
     512               0 :                 elevel = data_sync_elevel(WARNING);
     513 EUB             : 
     514 UIC           0 :             ereport(elevel,
     515                 :                     (errcode_for_file_access(),
     516                 :                      errmsg("could not flush dirty data: %m")));
     517                 :         }
     518                 : 
     519               0 :         return;
     520                 :     }
     521                 : #endif
     522                 : #if !defined(WIN32) && defined(MS_ASYNC)
     523                 :     {
     524                 :         void       *p;
     525                 :         static int  pagesize = 0;
     526                 : 
     527                 :         /*
     528                 :          * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
     529                 :          * writeback. On linux it only does so if MS_SYNC is specified, but
     530                 :          * then it does the writeback synchronously. Luckily all common linux
     531                 :          * systems have sync_file_range().  This is preferable over
     532                 :          * FADV_DONTNEED because it doesn't flush out clean data.
     533                 :          *
     534                 :          * We map the file (mmap()), tell the kernel to sync back the contents
     535                 :          * (msync()), and then remove the mapping again (munmap()).
     536                 :          */
     537                 : 
     538                 :         /* mmap() needs actual length if we want to map whole file */
     539                 :         if (offset == 0 && nbytes == 0)
     540                 :         {
     541                 :             nbytes = lseek(fd, 0, SEEK_END);
     542                 :             if (nbytes < 0)
     543                 :             {
     544                 :                 ereport(WARNING,
     545                 :                         (errcode_for_file_access(),
     546                 :                          errmsg("could not determine dirty data size: %m")));
     547                 :                 return;
     548                 :             }
     549                 :         }
     550                 : 
     551                 :         /*
     552                 :          * Some platforms reject partial-page mmap() attempts.  To deal with
     553                 :          * that, just truncate the request to a page boundary.  If any extra
     554                 :          * bytes don't get flushed, well, it's only a hint anyway.
     555                 :          */
     556                 : 
     557                 :         /* fetch pagesize only once */
     558                 :         if (pagesize == 0)
     559                 :             pagesize = sysconf(_SC_PAGESIZE);
     560                 : 
     561                 :         /* align length to pagesize, dropping any fractional page */
     562                 :         if (pagesize > 0)
     563                 :             nbytes = (nbytes / pagesize) * pagesize;
     564                 : 
     565                 :         /* fractional-page request is a no-op */
     566                 :         if (nbytes <= 0)
     567                 :             return;
     568                 : 
     569                 :         /*
     570                 :          * mmap could well fail, particularly on 32-bit platforms where there
     571                 :          * may simply not be enough address space.  If so, silently fall
     572                 :          * through to the next implementation.
     573                 :          */
     574                 :         if (nbytes <= (off_t) SSIZE_MAX)
     575                 :             p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
     576                 :         else
     577                 :             p = MAP_FAILED;
     578                 : 
     579                 :         if (p != MAP_FAILED)
     580                 :         {
     581                 :             int         rc;
     582                 : 
     583                 :             rc = msync(p, (size_t) nbytes, MS_ASYNC);
     584                 :             if (rc != 0)
     585                 :             {
     586                 :                 ereport(data_sync_elevel(WARNING),
     587                 :                         (errcode_for_file_access(),
     588                 :                          errmsg("could not flush dirty data: %m")));
     589                 :                 /* NB: need to fall through to munmap()! */
     590                 :             }
     591                 : 
     592                 :             rc = munmap(p, (size_t) nbytes);
     593                 :             if (rc != 0)
     594                 :             {
     595                 :                 /* FATAL error because mapping would remain */
     596                 :                 ereport(FATAL,
     597                 :                         (errcode_for_file_access(),
     598                 :                          errmsg("could not munmap() while flushing data: %m")));
     599                 :             }
     600                 : 
     601                 :             return;
     602                 :         }
     603                 :     }
     604                 : #endif
     605                 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     606                 :     {
     607                 :         int         rc;
     608                 : 
     609                 :         /*
     610                 :          * Signal the kernel that the passed in range should not be cached
     611                 :          * anymore. This has the, desired, side effect of writing out dirty
     612                 :          * data, and the, undesired, side effect of likely discarding useful
     613                 :          * clean cached blocks.  For the latter reason this is the least
     614                 :          * preferable method.
     615                 :          */
     616                 : 
     617                 :         rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
     618                 : 
     619                 :         if (rc != 0)
     620                 :         {
     621                 :             /* don't error out, this is just a performance optimization */
     622                 :             ereport(WARNING,
     623                 :                     (errcode_for_file_access(),
     624                 :                      errmsg("could not flush dirty data: %m")));
     625                 :         }
     626                 : 
     627                 :         return;
     628                 :     }
     629                 : #endif
     630 ECB             : }
     631                 : 
     632                 : /*
     633                 :  * Truncate a file to a given length by name.
     634                 :  */
     635                 : int
     636 GIC      174303 : pg_truncate(const char *path, off_t length)
     637                 : {
     638                 : #ifdef WIN32
     639                 :     int         save_errno;
     640                 :     int         ret;
     641                 :     int         fd;
     642                 : 
     643                 :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
     644                 :     if (fd >= 0)
     645                 :     {
     646                 :         ret = ftruncate(fd, length);
     647                 :         save_errno = errno;
     648                 :         CloseTransientFile(fd);
     649                 :         errno = save_errno;
     650 ECB             :     }
     651                 :     else
     652                 :         ret = -1;
     653                 : 
     654                 :     return ret;
     655                 : #else
     656 GIC      174303 :     return truncate(path, length);
     657                 : #endif
     658                 : }
     659                 : 
     660                 : /*
     661 ECB             :  * fsync_fname -- fsync a file or directory, handling errors properly
     662                 :  *
     663                 :  * Try to fsync a file or directory. When doing the latter, ignore errors that
     664                 :  * indicate the OS just doesn't allow/require fsyncing directories.
     665                 :  */
     666                 : void
     667 GIC       19861 : fsync_fname(const char *fname, bool isdir)
     668                 : {
     669           19861 :     fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
     670           19861 : }
     671                 : 
     672                 : /*
     673                 :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     674                 :  *
     675                 :  * This routine ensures that, after returning, the effect of renaming file
     676                 :  * persists in case of a crash. A crash while this routine is running will
     677                 :  * leave you with either the pre-existing or the moved file in place of the
     678                 :  * new file; no mixed state or truncated files are possible.
     679                 :  *
     680                 :  * It does so by using fsync on the old filename and the possibly existing
     681                 :  * target filename before the rename, and the target file and directory after.
     682                 :  *
     683                 :  * Note that rename() cannot be used across arbitrary directories, as they
     684                 :  * might not be on the same filesystem. Therefore this routine does not
     685                 :  * support renaming across directories.
     686                 :  *
     687 ECB             :  * Log errors with the caller specified severity.
     688                 :  *
     689                 :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     690                 :  * valid upon return.
     691                 :  */
     692                 : int
     693 GIC        4718 : durable_rename(const char *oldfile, const char *newfile, int elevel)
     694                 : {
     695                 :     int         fd;
     696                 : 
     697                 :     /*
     698 ECB             :      * First fsync the old and target path (if it exists), to ensure that they
     699 EUB             :      * are properly persistent on disk. Syncing the target file is not
     700                 :      * strictly necessary, but it makes it easier to reason about crashes;
     701 ECB             :      * because it's then guaranteed that either source or target file exists
     702                 :      * after a crash.
     703                 :      */
     704 CBC        4718 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     705 UIC           0 :         return -1;
     706 EUB             : 
     707 GIC        4718 :     fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
     708            4718 :     if (fd < 0)
     709 EUB             :     {
     710 GIC        2507 :         if (errno != ENOENT)
     711                 :         {
     712 UIC           0 :             ereport(elevel,
     713                 :                     (errcode_for_file_access(),
     714 ECB             :                      errmsg("could not open file \"%s\": %m", newfile)));
     715 UIC           0 :             return -1;
     716                 :         }
     717                 :     }
     718                 :     else
     719 EUB             :     {
     720 GBC        2211 :         if (pg_fsync(fd) != 0)
     721 EUB             :         {
     722                 :             int         save_errno;
     723                 : 
     724                 :             /* close file upon error, might not be in transaction context */
     725 UIC           0 :             save_errno = errno;
     726 UBC           0 :             CloseTransientFile(fd);
     727 UIC           0 :             errno = save_errno;
     728                 : 
     729 LBC           0 :             ereport(elevel,
     730                 :                     (errcode_for_file_access(),
     731 EUB             :                      errmsg("could not fsync file \"%s\": %m", newfile)));
     732 UIC           0 :             return -1;
     733                 :         }
     734 EUB             : 
     735 GIC        2211 :         if (CloseTransientFile(fd) != 0)
     736                 :         {
     737 UIC           0 :             ereport(elevel,
     738                 :                     (errcode_for_file_access(),
     739 ECB             :                      errmsg("could not close file \"%s\": %m", newfile)));
     740 UIC           0 :             return -1;
     741 EUB             :         }
     742                 :     }
     743                 : 
     744                 :     /* Time to do the real deal... */
     745 GBC        4718 :     if (rename(oldfile, newfile) < 0)
     746                 :     {
     747 UIC           0 :         ereport(elevel,
     748                 :                 (errcode_for_file_access(),
     749                 :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     750                 :                         oldfile, newfile)));
     751               0 :         return -1;
     752 ECB             :     }
     753 EUB             : 
     754                 :     /*
     755 ECB             :      * To guarantee renaming the file is persistent, fsync the file with its
     756 EUB             :      * new name, and its containing directory.
     757                 :      */
     758 CBC        4718 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     759 UIC           0 :         return -1;
     760                 : 
     761 GIC        4718 :     if (fsync_parent_path(newfile, elevel) != 0)
     762 UIC           0 :         return -1;
     763                 : 
     764 GIC        4718 :     return 0;
     765                 : }
     766                 : 
     767                 : /*
     768                 :  * durable_unlink -- remove a file in a durable manner
     769                 :  *
     770                 :  * This routine ensures that, after returning, the effect of removing file
     771                 :  * persists in case of a crash. A crash while this routine is running will
     772                 :  * leave the system in no mixed state.
     773                 :  *
     774                 :  * It does so by using fsync on the parent directory of the file after the
     775                 :  * actual removal is done.
     776                 :  *
     777 ECB             :  * Log errors with the severity specified by caller.
     778                 :  *
     779                 :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     780                 :  * valid upon return.
     781                 :  */
     782                 : int
     783 GIC         121 : durable_unlink(const char *fname, int elevel)
     784                 : {
     785 CBC         121 :     if (unlink(fname) < 0)
     786                 :     {
     787 GIC          30 :         ereport(elevel,
     788                 :                 (errcode_for_file_access(),
     789                 :                  errmsg("could not remove file \"%s\": %m",
     790                 :                         fname)));
     791              30 :         return -1;
     792 ECB             :     }
     793 EUB             : 
     794                 :     /*
     795 ECB             :      * To guarantee that the removal of the file is persistent, fsync its
     796                 :      * parent directory.
     797                 :      */
     798 GIC          91 :     if (fsync_parent_path(fname, elevel) != 0)
     799 UIC           0 :         return -1;
     800                 : 
     801 GIC          91 :     return 0;
     802                 : }
     803                 : 
     804                 : /*
     805                 :  * InitFileAccess --- initialize this module during backend startup
     806 ECB             :  *
     807                 :  * This is called during either normal or standalone backend start.
     808                 :  * It is *not* called in the postmaster.
     809                 :  *
     810                 :  * Note that this does not initialize temporary file access, that is
     811                 :  * separately initialized via InitTemporaryFileAccess().
     812                 :  */
     813                 : void
     814 GIC       13291 : InitFileAccess(void)
     815                 : {
     816           13291 :     Assert(SizeVfdCache == 0);  /* call me only once */
     817                 : 
     818                 :     /* initialize cache header entry */
     819 CBC       13291 :     VfdCache = (Vfd *) malloc(sizeof(Vfd));
     820           13291 :     if (VfdCache == NULL)
     821 UIC           0 :         ereport(FATAL,
     822                 :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     823 ECB             :                  errmsg("out of memory")));
     824                 : 
     825 GBC      106328 :     MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
     826 GIC       13291 :     VfdCache->fd = VFD_CLOSED;
     827                 : 
     828           13291 :     SizeVfdCache = 1;
     829           13291 : }
     830 ECB             : 
     831                 : /*
     832                 :  * InitTemporaryFileAccess --- initialize temporary file access during startup
     833                 :  *
     834                 :  * This is called during either normal or standalone backend start.
     835                 :  * It is *not* called in the postmaster.
     836                 :  *
     837                 :  * This is separate from InitFileAccess() because temporary file cleanup can
     838                 :  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
     839                 :  * our reporting has to happen before that. Low level file access should be
     840 EUB             :  * available for longer, hence the separate initialization / shutdown of
     841                 :  * temporary file handling.
     842                 :  */
     843 ECB             : void
     844 CBC       13291 : InitTemporaryFileAccess(void)
     845                 : {
     846 GIC       13291 :     Assert(SizeVfdCache != 0);  /* InitFileAccess() needs to have run */
     847 GBC       13291 :     Assert(!temporary_files_allowed);   /* call me only once */
     848 EUB             : 
     849                 :     /*
     850                 :      * Register before-shmem-exit hook to ensure temp files are dropped while
     851                 :      * we can still report stats.
     852 ECB             :      */
     853 GIC       13291 :     before_shmem_exit(BeforeShmemExit_Files, 0);
     854 EUB             : 
     855                 : #ifdef USE_ASSERT_CHECKING
     856 GIC       13291 :     temporary_files_allowed = true;
     857 ECB             : #endif
     858 GIC       13291 : }
     859 ECB             : 
     860                 : /*
     861                 :  * count_usable_fds --- count how many FDs the system will let us open,
     862                 :  *      and estimate how many are already open.
     863                 :  *
     864                 :  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
     865                 :  * value of max_to_probe might result in an underestimate of already_open;
     866                 :  * we must fill in any "gaps" in the set of used FDs before the calculation
     867                 :  * of already_open will give the right answer.  In practice, max_to_probe
     868                 :  * of a couple of dozen should be enough to ensure good results.
     869                 :  *
     870                 :  * We assume stderr (FD 2) is available for dup'ing.  While the calling
     871                 :  * script could theoretically close that, it would be a really bad idea,
     872                 :  * since then one risks loss of error messages from, e.g., libc.
     873                 :  */
     874                 : static void
     875 GIC         593 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
     876                 : {
     877 ECB             :     int        *fd;
     878                 :     int         size;
     879 CBC         593 :     int         used = 0;
     880 GIC         593 :     int         highestfd = 0;
     881                 :     int         j;
     882                 : 
     883                 : #ifdef HAVE_GETRLIMIT
     884                 :     struct rlimit rlim;
     885                 :     int         getrlimit_status;
     886 ECB             : #endif
     887                 : 
     888 GIC         593 :     size = 1024;
     889             593 :     fd = (int *) palloc(size * sizeof(int));
     890                 : 
     891                 : #ifdef HAVE_GETRLIMIT
     892             593 :     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
     893             593 :     if (getrlimit_status != 0)
     894 UIC           0 :         ereport(WARNING, (errmsg("getrlimit failed: %m")));
     895 ECB             : #endif                          /* HAVE_GETRLIMIT */
     896                 : 
     897                 :     /* dup until failure or probe limit reached */
     898                 :     for (;;)
     899 GIC      592407 :     {
     900                 :         int         thisfd;
     901                 : 
     902                 : #ifdef HAVE_GETRLIMIT
     903 ECB             : 
     904                 :         /*
     905                 :          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
     906                 :          * some platforms
     907                 :          */
     908 CBC      593000 :         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
     909 UBC           0 :             break;
     910                 : #endif
     911                 : 
     912 GIC      593000 :         thisfd = dup(2);
     913          593000 :         if (thisfd < 0)
     914                 :         {
     915                 :             /* Expect EMFILE or ENFILE, else it's fishy */
     916 LBC           0 :             if (errno != EMFILE && errno != ENFILE)
     917 UIC           0 :                 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
     918 LBC           0 :             break;
     919                 :         }
     920                 : 
     921 GIC      593000 :         if (used >= size)
     922                 :         {
     923 UIC           0 :             size *= 2;
     924               0 :             fd = (int *) repalloc(fd, size * sizeof(int));
     925 ECB             :         }
     926 GIC      593000 :         fd[used++] = thisfd;
     927 ECB             : 
     928 GIC      593000 :         if (highestfd < thisfd)
     929          593000 :             highestfd = thisfd;
     930                 : 
     931          593000 :         if (used >= max_to_probe)
     932             593 :             break;
     933                 :     }
     934                 : 
     935                 :     /* release the files we opened */
     936          593593 :     for (j = 0; j < used; j++)
     937          593000 :         close(fd[j]);
     938                 : 
     939             593 :     pfree(fd);
     940                 : 
     941                 :     /*
     942                 :      * Return results.  usable_fds is just the number of successful dups. We
     943                 :      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
     944                 :      * number) and so already_open is highestfd+1 - usable_fds.
     945                 :      */
     946             593 :     *usable_fds = used;
     947 CBC         593 :     *already_open = highestfd + 1 - used;
     948 GIC         593 : }
     949                 : 
     950                 : /*
     951 ECB             :  * set_max_safe_fds
     952                 :  *      Determine number of file descriptors that fd.c is allowed to use
     953                 :  */
     954                 : void
     955 GIC         593 : set_max_safe_fds(void)
     956                 : {
     957                 :     int         usable_fds;
     958                 :     int         already_open;
     959                 : 
     960                 :     /*----------
     961                 :      * We want to set max_safe_fds to
     962                 :      *          MIN(usable_fds, max_files_per_process - already_open)
     963                 :      * less the slop factor for files that are opened without consulting
     964                 :      * fd.c.  This ensures that we won't exceed either max_files_per_process
     965                 :      * or the experimentally-determined EMFILE limit.
     966                 :      *----------
     967                 :      */
     968             593 :     count_usable_fds(max_files_per_process,
     969                 :                      &usable_fds, &already_open);
     970                 : 
     971             593 :     max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
     972 ECB             : 
     973                 :     /*
     974                 :      * Take off the FDs reserved for system() etc.
     975                 :      */
     976 GIC         593 :     max_safe_fds -= NUM_RESERVED_FDS;
     977                 : 
     978                 :     /*
     979                 :      * Make sure we still have enough to get by.
     980                 :      */
     981             593 :     if (max_safe_fds < FD_MINFREE)
     982 UIC           0 :         ereport(FATAL,
     983                 :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
     984                 :                  errmsg("insufficient file descriptors available to start server process"),
     985                 :                  errdetail("System allows %d, server needs at least %d.",
     986                 :                            max_safe_fds + NUM_RESERVED_FDS,
     987                 :                            FD_MINFREE + NUM_RESERVED_FDS)));
     988                 : 
     989 GIC         593 :     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
     990                 :          max_safe_fds, usable_fds, already_open);
     991 CBC         593 : }
     992                 : 
     993                 : /*
     994 ECB             :  * Open a file with BasicOpenFilePerm() and pass default file mode for the
     995                 :  * fileMode parameter.
     996 EUB             :  */
     997                 : int
     998 GBC       24767 : BasicOpenFile(const char *fileName, int fileFlags)
     999                 : {
    1000 GIC       24767 :     return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1001 EUB             : }
    1002                 : 
    1003                 : /*
    1004                 :  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
    1005                 :  *
    1006                 :  * This is exported for use by places that really want a plain kernel FD,
    1007 ECB             :  * but need to be proof against running out of FDs.  Once an FD has been
    1008                 :  * successfully returned, it is the caller's responsibility to ensure that
    1009                 :  * it will not be leaked on ereport()!  Most users should *not* call this
    1010                 :  * routine directly, but instead use the VFD abstraction level, which
    1011                 :  * provides protection against descriptor leaks as well as management of
    1012                 :  * files that need to be open for more than a short period of time.
    1013                 :  *
    1014                 :  * Ideally this should be the *only* direct call of open() in the backend.
    1015                 :  * In practice, the postmaster calls open() directly, and there are some
    1016                 :  * direct open() calls done early in backend startup.  Those are OK since
    1017                 :  * this module wouldn't have any open files to close at that point anyway.
    1018                 :  */
    1019                 : int
    1020 GIC     2476900 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1021                 : {
    1022                 :     int         fd;
    1023                 : 
    1024 CBC     2476900 : tryAgain:
    1025                 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1026                 : 
    1027                 :     /*
    1028                 :      * The value we defined to stand in for O_DIRECT when simulating it with
    1029                 :      * F_NOCACHE had better not collide with any of the standard flags.
    1030 ECB             :      */
    1031                 :     StaticAssertStmt((PG_O_DIRECT &
    1032                 :                       (O_APPEND |
    1033                 :                        O_CLOEXEC |
    1034                 :                        O_CREAT |
    1035                 :                        O_DSYNC |
    1036                 :                        O_EXCL |
    1037 EUB             :                        O_RDWR |
    1038                 :                        O_RDONLY |
    1039                 :                        O_SYNC |
    1040                 :                        O_TRUNC |
    1041                 :                        O_WRONLY)) == 0,
    1042                 :                      "PG_O_DIRECT value collides with standard flag");
    1043                 :     fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
    1044                 : #else
    1045 GIC     2476900 :     fd = open(fileName, fileFlags, fileMode);
    1046                 : #endif
    1047                 : 
    1048         2476900 :     if (fd >= 0)
    1049                 :     {
    1050                 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1051                 :         if (fileFlags & PG_O_DIRECT)
    1052 ECB             :         {
    1053                 :             if (fcntl(fd, F_NOCACHE, 1) < 0)
    1054                 :             {
    1055                 :                 int         save_errno = errno;
    1056                 : 
    1057                 :                 close(fd);
    1058                 :                 errno = save_errno;
    1059                 :                 return -1;
    1060                 :             }
    1061                 :         }
    1062                 : #endif
    1063                 : 
    1064 GIC     1998010 :         return fd;              /* success! */
    1065                 :     }
    1066                 : 
    1067          478890 :     if (errno == EMFILE || errno == ENFILE)
    1068                 :     {
    1069 UIC           0 :         int         save_errno = errno;
    1070 ECB             : 
    1071 UIC           0 :         ereport(LOG,
    1072 ECB             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1073                 :                  errmsg("out of file descriptors: %m; release and retry")));
    1074 LBC           0 :         errno = 0;
    1075 UIC           0 :         if (ReleaseLruFile())
    1076               0 :             goto tryAgain;
    1077               0 :         errno = save_errno;
    1078                 :     }
    1079                 : 
    1080 GIC      478890 :     return -1;                  /* failure */
    1081                 : }
    1082                 : 
    1083                 : /*
    1084                 :  * AcquireExternalFD - attempt to reserve an external file descriptor
    1085                 :  *
    1086                 :  * This should be used by callers that need to hold a file descriptor open
    1087                 :  * over more than a short interval, but cannot use any of the other facilities
    1088                 :  * provided by this module.
    1089                 :  *
    1090                 :  * The difference between this and the underlying ReserveExternalFD function
    1091                 :  * is that this will report failure (by setting errno and returning false)
    1092                 :  * if "too many" external FDs are already reserved.  This should be used in
    1093                 :  * any code where the total number of FDs to be reserved is not both
    1094                 :  * predictable and small.
    1095                 :  */
    1096                 : bool
    1097          126279 : AcquireExternalFD(void)
    1098                 : {
    1099 ECB             :     /*
    1100                 :      * We don't want more than max_safe_fds / 3 FDs to be consumed for
    1101                 :      * "external" FDs.
    1102                 :      */
    1103 CBC      126279 :     if (numExternalFDs < max_safe_fds / 3)
    1104                 :     {
    1105 GIC      126279 :         ReserveExternalFD();
    1106          126279 :         return true;
    1107                 :     }
    1108 UIC           0 :     errno = EMFILE;
    1109 LBC           0 :     return false;
    1110                 : }
    1111 ECB             : 
    1112                 : /*
    1113                 :  * ReserveExternalFD - report external consumption of a file descriptor
    1114                 :  *
    1115                 :  * This should be used by callers that need to hold a file descriptor open
    1116                 :  * over more than a short interval, but cannot use any of the other facilities
    1117                 :  * provided by this module.  This just tracks the use of the FD and closes
    1118                 :  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
    1119                 :  *
    1120                 :  * Call this directly only in code where failure to reserve the FD would be
    1121                 :  * fatal; for example, the WAL-writing code does so, since the alternative is
    1122                 :  * session failure.  Also, it's very unwise to do so in code that could
    1123                 :  * consume more than one FD per process.
    1124                 :  *
    1125                 :  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
    1126                 :  * available, it doesn't matter too much whether this is called before or
    1127                 :  * after actually opening the FD; but doing so beforehand reduces the risk of
    1128                 :  * an EMFILE failure if not everybody played nice.  In any case, it's solely
    1129                 :  * the caller's responsibility to keep the external-FD count in sync with reality.
    1130                 :  */
    1131                 : void
    1132 GIC      175848 : ReserveExternalFD(void)
    1133 ECB             : {
    1134 EUB             :     /*
    1135                 :      * Release VFDs if needed to stay safe.  Because we do this before
    1136 ECB             :      * incrementing numExternalFDs, the final state will be as desired, i.e.,
    1137                 :      * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
    1138                 :      */
    1139 GIC      175848 :     ReleaseLruFiles();
    1140 ECB             : 
    1141 CBC      175848 :     numExternalFDs++;
    1142 GIC      175848 : }
    1143                 : 
    1144 ECB             : /*
    1145                 :  * ReleaseExternalFD - report release of an external file descriptor
    1146                 :  *
    1147                 :  * This is guaranteed not to change errno, so it can be used in failure paths.
    1148                 :  */
    1149                 : void
    1150 GIC      159582 : ReleaseExternalFD(void)
    1151                 : {
    1152          159582 :     Assert(numExternalFDs > 0);
    1153          159582 :     numExternalFDs--;
    1154 CBC      159582 : }
    1155                 : 
    1156 ECB             : 
    1157                 : #if defined(FDDEBUG)
    1158                 : 
    1159                 : static void
    1160                 : _dump_lru(void)
    1161                 : {
    1162                 :     int         mru = VfdCache[0].lruLessRecently;
    1163                 :     Vfd        *vfdP = &VfdCache[mru];
    1164                 :     char        buf[2048];
    1165                 : 
    1166                 :     snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
    1167                 :     while (mru != 0)
    1168                 :     {
    1169                 :         mru = vfdP->lruLessRecently;
    1170                 :         vfdP = &VfdCache[mru];
    1171                 :         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
    1172                 :     }
    1173                 :     snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
    1174                 :     elog(LOG, "%s", buf);
    1175                 : }
    1176                 : #endif                          /* FDDEBUG */
    1177                 : 
    1178                 : static void
    1179 GIC     2054304 : Delete(File file)
    1180 ECB             : {
    1181                 :     Vfd        *vfdP;
    1182                 : 
    1183 GIC     2054304 :     Assert(file != 0);
    1184                 : 
    1185                 :     DO_DB(elog(LOG, "Delete %d (%s)",
    1186                 :                file, VfdCache[file].fileName));
    1187 ECB             :     DO_DB(_dump_lru());
    1188                 : 
    1189 CBC     2054304 :     vfdP = &VfdCache[file];
    1190                 : 
    1191 GIC     2054304 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
    1192 GBC     2054304 :     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
    1193                 : 
    1194                 :     DO_DB(_dump_lru());
    1195 GIC     2054304 : }
    1196 ECB             : 
    1197                 : static void
    1198 GIC      264855 : LruDelete(File file)
    1199                 : {
    1200                 :     Vfd        *vfdP;
    1201                 : 
    1202          264855 :     Assert(file != 0);
    1203                 : 
    1204 ECB             :     DO_DB(elog(LOG, "LruDelete %d (%s)",
    1205                 :                file, VfdCache[file].fileName));
    1206                 : 
    1207 GIC      264855 :     vfdP = &VfdCache[file];
    1208                 : 
    1209                 :     /*
    1210                 :      * Close the file.  We aren't expecting this to fail; if it does, better
    1211                 :      * to leak the FD than to mess up our internal state.
    1212                 :      */
    1213 CBC      264855 :     if (close(vfdP->fd) != 0)
    1214 UIC           0 :         elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1215                 :              "could not close file \"%s\": %m", vfdP->fileName);
    1216 GIC      264855 :     vfdP->fd = VFD_CLOSED;
    1217 CBC      264855 :     --nfile;
    1218                 : 
    1219                 :     /* delete the vfd record from the LRU ring */
    1220 GIC      264855 :     Delete(file);
    1221          264855 : }
    1222                 : 
    1223 ECB             : static void
    1224 CBC     2283732 : Insert(File file)
    1225 ECB             : {
    1226                 :     Vfd        *vfdP;
    1227 EUB             : 
    1228 GIC     2283732 :     Assert(file != 0);
    1229                 : 
    1230                 :     DO_DB(elog(LOG, "Insert %d (%s)",
    1231                 :                file, VfdCache[file].fileName));
    1232                 :     DO_DB(_dump_lru());
    1233                 : 
    1234         2283732 :     vfdP = &VfdCache[file];
    1235 ECB             : 
    1236 GIC     2283732 :     vfdP->lruMoreRecently = 0;
    1237 CBC     2283732 :     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
    1238 GIC     2283732 :     VfdCache[0].lruLessRecently = file;
    1239 CBC     2283732 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
    1240 EUB             : 
    1241                 :     DO_DB(_dump_lru());
    1242 CBC     2283732 : }
    1243                 : 
    1244                 : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1245 ECB             : static int
    1246 GIC      105661 : LruInsert(File file)
    1247                 : {
    1248                 :     Vfd        *vfdP;
    1249                 : 
    1250          105661 :     Assert(file != 0);
    1251                 : 
    1252 ECB             :     DO_DB(elog(LOG, "LruInsert %d (%s)",
    1253                 :                file, VfdCache[file].fileName));
    1254                 : 
    1255 GIC      105661 :     vfdP = &VfdCache[file];
    1256                 : 
    1257          105661 :     if (FileIsNotOpen(file))
    1258                 :     {
    1259                 :         /* Close excess kernel FDs. */
    1260          105661 :         ReleaseLruFiles();
    1261 ECB             : 
    1262                 :         /*
    1263                 :          * The open could still fail for lack of file descriptors, e.g., due to
    1264                 :          * overall system file table being full.  So, be prepared to release
    1265                 :          * another FD if necessary...
    1266                 :          */
    1267 GIC      105661 :         vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
    1268                 :                                      vfdP->fileMode);
    1269          105661 :         if (vfdP->fd < 0)
    1270 ECB             :         {
    1271                 :             DO_DB(elog(LOG, "re-open failed: %m"));
    1272 UBC           0 :             return -1;
    1273                 :         }
    1274                 :         else
    1275 ECB             :         {
    1276 GIC      105661 :             ++nfile;
    1277                 :         }
    1278                 :     }
    1279                 : 
    1280 ECB             :     /*
    1281                 :      * put it at the head of the Lru ring
    1282                 :      */
    1283                 : 
    1284 CBC      105661 :     Insert(file);
    1285                 : 
    1286          105661 :     return 0;
    1287 ECB             : }
    1288                 : 
    1289                 : /*
    1290                 :  * Release one kernel FD by closing the least-recently-used VFD.
    1291                 :  */
    1292                 : static bool
    1293 GIC      264791 : ReleaseLruFile(void)
    1294                 : {
    1295 ECB             :     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
    1296                 : 
    1297 CBC      264791 :     if (nfile > 0)
    1298                 :     {
    1299 ECB             :         /*
    1300                 :          * There are opened files and so there should be at least one used vfd
    1301                 :          * in the ring.
    1302                 :          */
    1303 CBC      264791 :         Assert(VfdCache[0].lruMoreRecently != 0);
    1304 GIC      264791 :         LruDelete(VfdCache[0].lruMoreRecently);
    1305 CBC      264791 :         return true;            /* freed a file */
    1306                 :     }
    1307 UIC           0 :     return false;               /* no files available to free */
    1308                 : }
    1309                 : 
    1310 ECB             : /*
    1311                 :  * Release kernel FDs as needed to get under the max_safe_fds limit.
    1312                 :  * After calling this, it's OK to try to open another file.
    1313                 :  */
    1314                 : static void
    1315 CBC     2733022 : ReleaseLruFiles(void)
    1316                 : {
    1317         2997813 :     while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
    1318 ECB             :     {
    1319 CBC      264791 :         if (!ReleaseLruFile())
    1320 UIC           0 :             break;
    1321                 :     }
    1322 GIC     2733022 : }
    1323 ECB             : 
    1324                 : static File
    1325 GIC     1746847 : AllocateVfd(void)
    1326                 : {
    1327                 :     Index       i;
    1328                 :     File        file;
    1329                 : 
    1330                 :     DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
    1331                 : 
    1332         1746847 :     Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */
    1333                 : 
    1334         1746847 :     if (VfdCache[0].nextFree == 0)
    1335 ECB             :     {
    1336                 :         /*
    1337                 :          * The free list is empty so it is time to increase the size of the
    1338                 :          * array.  We choose to double it each time this happens. However,
    1339 EUB             :          * there's not much point in starting *real* small.
    1340                 :          */
    1341 CBC       17078 :         Size        newCacheSize = SizeVfdCache * 2;
    1342                 :         Vfd        *newVfdCache;
    1343                 : 
    1344 GIC       17078 :         if (newCacheSize < 32)
    1345           11791 :             newCacheSize = 32;
    1346                 : 
    1347                 :         /*
    1348 ECB             :          * Be careful not to clobber VfdCache ptr if realloc fails.
    1349                 :          */
    1350 GIC       17078 :         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
    1351           17078 :         if (newVfdCache == NULL)
    1352 LBC           0 :             ereport(ERROR,
    1353                 :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    1354                 :                      errmsg("out of memory")));
    1355 GIC       17078 :         VfdCache = newVfdCache;
    1356                 : 
    1357                 :         /*
    1358                 :          * Initialize the new entries and link them into the free list.
    1359 ECB             :          */
    1360 GIC      811335 :         for (i = SizeVfdCache; i < newCacheSize; i++)
    1361 ECB             :         {
    1362 GIC     6354056 :             MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
    1363 CBC      794257 :             VfdCache[i].nextFree = i + 1;
    1364 GIC      794257 :             VfdCache[i].fd = VFD_CLOSED;
    1365 ECB             :         }
    1366 CBC       17078 :         VfdCache[newCacheSize - 1].nextFree = 0;
    1367 GIC       17078 :         VfdCache[0].nextFree = SizeVfdCache;
    1368                 : 
    1369                 :         /*
    1370 ECB             :          * Record the new size
    1371                 :          */
    1372 GIC       17078 :         SizeVfdCache = newCacheSize;
    1373                 :     }
    1374                 : 
    1375         1746847 :     file = VfdCache[0].nextFree;
    1376                 : 
    1377         1746847 :     VfdCache[0].nextFree = VfdCache[file].nextFree;
    1378 ECB             : 
    1379 GIC     1746847 :     return file;
    1380 ECB             : }
    1381                 : 
    1382                 : static void
    1383 GIC     1453862 : FreeVfd(File file)
    1384 ECB             : {
    1385 CBC     1453862 :     Vfd        *vfdP = &VfdCache[file];
    1386 ECB             : 
    1387                 :     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
    1388                 :                file, vfdP->fileName ? vfdP->fileName : ""));
    1389                 : 
    1390 GIC     1453862 :     if (vfdP->fileName != NULL)
    1391                 :     {
    1392          978713 :         free(vfdP->fileName);
    1393          978713 :         vfdP->fileName = NULL;
    1394                 :     }
    1395         1453862 :     vfdP->fdstate = 0x0;
    1396                 : 
    1397         1453862 :     vfdP->nextFree = VfdCache[0].nextFree;
    1398         1453862 :     VfdCache[0].nextFree = file;
    1399         1453862 : }
    1400                 : 
    1401                 : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1402                 : static int
    1403         3545355 : FileAccess(File file)
    1404                 : {
    1405                 :     int         returnValue;
    1406 ECB             : 
    1407                 :     DO_DB(elog(LOG, "FileAccess %d (%s)",
    1408                 :                file, VfdCache[file].fileName));
    1409                 : 
    1410                 :     /*
    1411                 :      * Is the file open?  If not, open it and put it at the head of the LRU
    1412                 :      * ring (possibly closing the least recently used file to get an FD).
    1413                 :      */
    1414                 : 
    1415 GIC     3545355 :     if (FileIsNotOpen(file))
    1416                 :     {
    1417          105661 :         returnValue = LruInsert(file);
    1418          105661 :         if (returnValue != 0)
    1419 LBC           0 :             return returnValue;
    1420                 :     }
    1421 GIC     3439694 :     else if (VfdCache[0].lruLessRecently != file)
    1422                 :     {
    1423                 :         /*
    1424                 :          * We now know that the file is open and that it is not the last one
    1425                 :          * accessed, so we need to move it to the head of the Lru ring.
    1426                 :          */
    1427                 : 
    1428          906373 :         Delete(file);
    1429          906373 :         Insert(file);
    1430                 :     }
    1431 ECB             : 
    1432 CBC     3545355 :     return 0;
    1433 EUB             : }
    1434                 : 
    1435                 : /*
    1436                 :  * Called whenever a temporary file is deleted to report its size.
    1437 ECB             :  */
    1438                 : static void
    1439 GIC        3222 : ReportTemporaryFileUsage(const char *path, off_t size)
    1440                 : {
    1441 CBC        3222 :     pgstat_report_tempfile(size);
    1442                 : 
    1443 GIC        3222 :     if (log_temp_files >= 0)
    1444                 :     {
    1445             874 :         if ((size / 1024) >= log_temp_files)
    1446             135 :             ereport(LOG,
    1447                 :                     (errmsg("temporary file: path \"%s\", size %lu",
    1448                 :                             path, (unsigned long) size)));
    1449 ECB             :     }
    1450 GIC        3222 : }
    1451 ECB             : 
    1452                 : /*
    1453                 :  * Called to register a temporary file for automatic close.
    1454                 :  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
    1455                 :  * before the file was opened.
    1456                 :  */
    1457                 : static void
    1458 CBC        5110 : RegisterTemporaryFile(File file)
    1459 ECB             : {
    1460 CBC        5110 :     ResourceOwnerRememberFile(CurrentResourceOwner, file);
    1461 GIC        5110 :     VfdCache[file].resowner = CurrentResourceOwner;
    1462 ECB             : 
    1463                 :     /* Backup mechanism for closing at end of xact. */
    1464 GIC        5110 :     VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
    1465            5110 :     have_xact_temporary_files = true;
    1466 CBC        5110 : }
    1467                 : 
    1468 ECB             : /*
    1469                 :  *  Called when we get a shared invalidation message on some relation.
    1470                 :  */
    1471                 : #ifdef NOT_USED
    1472                 : void
    1473                 : FileInvalidate(File file)
    1474                 : {
    1475                 :     Assert(FileIsValid(file));
    1476                 :     if (!FileIsNotOpen(file))
    1477                 :         LruDelete(file);
    1478                 : }
    1479                 : #endif
    1480                 : 
    1481                 : /*
    1482                 :  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
    1483                 :  * fileMode parameter.
    1484                 :  */
    1485                 : File
    1486 GIC     1746847 : PathNameOpenFile(const char *fileName, int fileFlags)
    1487                 : {
    1488         1746847 :     return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1489                 : }
    1490                 : 
    1491 ECB             : /*
    1492                 :  * open a file in an arbitrary directory
    1493                 :  *
    1494                 :  * NB: if the passed pathname is relative (which it usually is),
    1495                 :  * it will be interpreted relative to the process' working directory
    1496                 :  * (which should always be $PGDATA when this code is running).
    1497                 :  */
    1498                 : File
    1499 GIC     1746847 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1500                 : {
    1501                 :     char       *fnamecopy;
    1502                 :     File        file;
    1503 ECB             :     Vfd        *vfdP;
    1504 EUB             : 
    1505                 :     DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
    1506                 :                fileName, fileFlags, fileMode));
    1507                 : 
    1508                 :     /*
    1509                 :      * We need a malloc'd copy of the file name; fail cleanly if no room.
    1510 ECB             :      */
    1511 GBC     1746847 :     fnamecopy = strdup(fileName);
    1512 GIC     1746847 :     if (fnamecopy == NULL)
    1513 UIC           0 :         ereport(ERROR,
    1514                 :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    1515                 :                  errmsg("out of memory")));
    1516                 : 
    1517 GIC     1746847 :     file = AllocateVfd();
    1518         1746847 :     vfdP = &VfdCache[file];
    1519                 : 
    1520                 :     /* Close excess kernel FDs. */
    1521         1746847 :     ReleaseLruFiles();
    1522 ECB             : 
    1523                 :     /*
    1524                 :      * Descriptors managed by VFDs are implicitly marked O_CLOEXEC.  The
    1525                 :      * client shouldn't be expected to know which kernel descriptors are
    1526                 :      * currently open, so it wouldn't make sense for them to be inherited by
    1527                 :      * executed subprograms.
    1528                 :      */
    1529 GNC     1746847 :     fileFlags |= O_CLOEXEC;
    1530                 : 
    1531 GIC     1746847 :     vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    1532                 : 
    1533         1746847 :     if (vfdP->fd < 0)
    1534                 :     {
    1535 CBC      475149 :         int         save_errno = errno;
    1536 ECB             : 
    1537 GIC      475149 :         FreeVfd(file);
    1538          475149 :         free(fnamecopy);
    1539          475149 :         errno = save_errno;
    1540          475149 :         return -1;
    1541                 :     }
    1542         1271698 :     ++nfile;
    1543                 :     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
    1544                 :                vfdP->fd));
    1545 ECB             : 
    1546 GIC     1271698 :     vfdP->fileName = fnamecopy;
    1547                 :     /* Saved flags are adjusted to be OK for re-opening file */
    1548         1271698 :     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    1549         1271698 :     vfdP->fileMode = fileMode;
    1550         1271698 :     vfdP->fileSize = 0;
    1551         1271698 :     vfdP->fdstate = 0x0;
    1552         1271698 :     vfdP->resowner = NULL;
    1553                 : 
    1554         1271698 :     Insert(file);
    1555                 : 
    1556         1271698 :     return file;
    1557                 : }
    1558                 : 
    1559                 : /*
    1560                 :  * Create directory 'directory'.  If necessary, create 'basedir', which must
    1561                 :  * be the directory above it.  This is designed for creating the top-level
    1562                 :  * temporary directory on demand before creating a directory underneath it.
    1563 ECB             :  * Do nothing if the directory already exists.
    1564                 :  *
    1565                 :  * Directories created within the top-level temporary directory should begin
    1566                 :  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
    1567                 :  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
    1568                 :  * that do not need any particular prefix.
    1569                 :  */
    1570                 : void
    1571 GIC         171 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
    1572                 : {
    1573 CBC         171 :     if (MakePGDirectory(directory) < 0)
    1574 ECB             :     {
    1575 GIC          18 :         if (errno == EEXIST)
    1576               9 :             return;
    1577                 : 
    1578                 :         /*
    1579                 :          * Failed.  Try to create basedir first in case it's missing. Tolerate
    1580                 :          * EEXIST to close a race against another process following the same
    1581                 :          * algorithm.
    1582                 :          */
    1583               9 :         if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
    1584 UIC           0 :             ereport(ERROR,
    1585 ECB             :                     (errcode_for_file_access(),
    1586                 :                      errmsg("cannot create temporary directory \"%s\": %m",
    1587                 :                             basedir)));
    1588                 : 
    1589                 :         /* Try again. */
    1590 CBC           9 :         if (MakePGDirectory(directory) < 0 && errno != EEXIST)
    1591 UIC           0 :             ereport(ERROR,
    1592                 :                     (errcode_for_file_access(),
    1593                 :                      errmsg("cannot create temporary subdirectory \"%s\": %m",
    1594                 :                             directory)));
    1595                 :     }
    1596                 : }
    1597                 : 
    1598 ECB             : /*
    1599                 :  * Delete a directory and everything in it, if it exists.
    1600                 :  */
    1601                 : void
    1602 GIC         198 : PathNameDeleteTemporaryDir(const char *dirname)
    1603                 : {
    1604                 :     struct stat statbuf;
    1605 ECB             : 
    1606                 :     /* Silently ignore missing directory. */
    1607 GIC         198 :     if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
    1608 CBC          36 :         return;
    1609 ECB             : 
    1610                 :     /*
    1611                 :      * Currently, walkdir doesn't offer a way for our passed in function to
    1612                 :      * maintain state.  Perhaps it should, so that we could tell the caller
    1613                 :      * whether this operation succeeded or failed.  Since this operation is
    1614                 :      * used in a cleanup path, we wouldn't actually behave differently: we'll
    1615                 :      * just log failures.
    1616                 :      */
    1617 GIC         162 :     walkdir(dirname, unlink_if_exists_fname, false, LOG);
    1618 ECB             : }
    1619                 : 
    1620                 : /*
    1621                 :  * Open a temporary file that will disappear when we close it.
    1622                 :  *
    1623                 :  * This routine takes care of generating an appropriate tempfile name.
    1624                 :  * There's no need to pass in fileFlags or fileMode either, since only
    1625                 :  * one setting makes any sense for a temp file.
    1626                 :  *
    1627                 :  * Unless interXact is true, the file is remembered by CurrentResourceOwner
    1628                 :  * to ensure it's closed and deleted when it's no longer needed, typically at
    1629                 :  * end of transaction. In most cases, you don't want temporary files to
    1630                 :  * outlive the transaction that created them, so this should be false -- but
    1631                 :  * if you need "somewhat" temporary storage, this might be useful. In either
    1632                 :  * case, the file is removed when the File is explicitly closed.
    1633                 :  */
    1634                 : File
    1635 GIC        1783 : OpenTemporaryFile(bool interXact)
    1636 ECB             : {
    1637 GIC        1783 :     File        file = 0;   /* 0 = no open attempted yet */
    1638                 : 
    1639            1783 :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1640                 : 
    1641                 :     /*
    1642                 :      * Make sure the current resource owner has space for this File before we
    1643 ECB             :      * open it, if we'll be registering it below.
    1644                 :      */
    1645 GIC        1783 :     if (!interXact)
    1646            1777 :         ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1647                 : 
    1648                 :     /*
    1649 ECB             :      * If some temp tablespace(s) have been given to us, try to use the next
    1650                 :      * one.  If a given tablespace can't be found, we silently fall back to
    1651                 :      * the database's default tablespace.
    1652                 :      *
    1653                 :      * BUT: if the temp file is slated to outlive the current transaction,
    1654                 :      * force it into the database's default tablespace, so that it will not
    1655                 :      * pose a threat to possible tablespace drop attempts.
    1656                 :      */
    1657 GIC        1783 :     if (numTempTableSpaces > 0 && !interXact)
    1658                 :     {
    1659               1 :         Oid         tblspcOid = GetNextTempTableSpace();
    1660                 : 
    1661               1 :         if (OidIsValid(tblspcOid))
    1662 CBC           1 :             file = OpenTemporaryFileInTablespace(tblspcOid, false);  /* false: don't ERROR, fall through below */
    1663                 :     }
    1664 ECB             : 
    1665                 :     /*
    1666                 :      * If not, or if tablespace is bad, create in database's default
    1667                 :      * tablespace.  MyDatabaseTableSpace should normally be set before we get
    1668                 :      * here, but just in case it isn't, fall back to pg_default tablespace.
    1669                 :      */
    1670 GIC        1783 :     if (file <= 0)
    1671            1782 :         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
    1672                 :                                              MyDatabaseTableSpace :
    1673                 :                                              DEFAULTTABLESPACE_OID,
    1674 ECB             :                                              true);
    1675                 : 
    1676                 :     /* Mark it for deletion at close and for temp_file_limit accounting */
    1677 GIC        1783 :     VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
    1678 ECB             : 
    1679 EUB             :     /* Register it with the current resource owner (skipped for interXact files) */
    1680 GIC        1783 :     if (!interXact)
    1681            1777 :         RegisterTemporaryFile(file);
    1682                 : 
    1683 CBC        1783 :     return file;
    1684                 : }
    1685                 : 
    1686                 : /*
    1687                 :  * Store in 'path' (room for MAXPGPATH bytes) the temp directory of a given tablespace.
    1688                 :  */
    1689                 : void
    1690 GIC        9649 : TempTablespacePath(char *path, Oid tablespace)
    1691                 : {
    1692                 :     /*
    1693                 :      * Identify the tempfile directory for this tablespace.
    1694                 :      *
    1695                 :      * InvalidOid and pg_global are both treated like pg_default.
    1696                 :      */
    1697            9649 :     if (tablespace == InvalidOid ||
    1698               1 :         tablespace == DEFAULTTABLESPACE_OID ||
    1699                 :         tablespace == GLOBALTABLESPACE_OID)
    1700 CBC        9648 :         snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
    1701                 :     else
    1702                 :     {
    1703                 :         /* All other tablespaces are accessed via symlinks */
    1704               1 :         snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
    1705                 :                  tablespace, TABLESPACE_VERSION_DIRECTORY,
    1706 ECB             :                  PG_TEMP_FILES_DIR);
    1707                 :     }
    1708 GIC        9649 : }
    1709                 : 
    1710                 : /*
    1711                 :  * Open a temporary file in a specific tablespace; ERROR on failure only if rejectError is true.
    1712 ECB             :  * Subroutine for OpenTemporaryFile; see that function for details.
    1713                 :  */
    1714                 : static File
    1715 CBC        1783 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
    1716 EUB             : {
    1717                 :     char        tempdirpath[MAXPGPATH];
    1718                 :     char        tempfilepath[MAXPGPATH];
    1719                 :     File        file;
    1720                 : 
    1721 CBC        1783 :     TempTablespacePath(tempdirpath, tblspcOid);
    1722                 : 
    1723                 :     /*
    1724                 :      * Generate a tempfile name (from our PID and tempFileCounter) that should
    1725 ECB             :      * be unique within the current database instance.
    1726                 :      */
    1727 GIC        1783 :     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
    1728 ECB             :              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
    1729                 : 
    1730                 :     /*
    1731                 :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1732                 :      * temp file that can be reused.
    1733                 :      */
    1734 GIC        1783 :     file = PathNameOpenFile(tempfilepath,
    1735                 :                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1736            1783 :     if (file <= 0)
    1737                 :     {
    1738                 :         /*
    1739                 :          * We might need to create the tablespace's tempfile directory, if no
    1740 ECB             :          * one has yet done so.
    1741                 :          *
    1742                 :          * Don't check for an error from MakePGDirectory; it could fail if
    1743                 :          * someone else just did the same thing.  If it doesn't work then
    1744                 :          * we'll bomb out on the second create attempt, instead.
    1745                 :          */
    1746 CBC          71 :         (void) MakePGDirectory(tempdirpath);
    1747                 : 
    1748              71 :         file = PathNameOpenFile(tempfilepath,
    1749                 :                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1750 GIC          71 :         if (file <= 0 && rejectError)
    1751 LBC           0 :             elog(ERROR, "could not create temporary file \"%s\": %m",
    1752 EUB             :                  tempfilepath);
    1753                 :     }
    1754                 : 
    1755 GIC        1783 :     return file;            /* may still be <= 0 when rejectError is false */
    1756                 : }
    1757 ECB             : 
    1758                 : 
    1759                 : /*
    1760                 :  * Create a new file.  The directory containing it must already exist.  Files
    1761                 :  * created this way are subject to temp_file_limit and are automatically
    1762                 :  * closed at end of transaction, but are not automatically deleted on close
    1763                 :  * because they are intended to be shared between cooperating backends.
    1764                 :  *
    1765                 :  * If the file is inside the top-level temporary directory, its name should
    1766                 :  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
    1767                 :  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
    1768                 :  * inside a directory created with PathNameCreateTemporaryDir(), in which case
    1769                 :  * the prefix isn't needed.
    1770                 :  */
    1771                 : File
    1772 GIC        1610 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
    1773                 : {
    1774                 :     File        file;
    1775                 : 
    1776            1610 :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1777 ECB             : 
    1778 CBC        1610 :     ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1779                 : 
    1780 ECB             :     /*
    1781                 :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1782                 :      * temp file that can be reused.
    1783                 :      */
    1784 GIC        1610 :     file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1785            1610 :     if (file <= 0)
    1786                 :     {
    1787 CBC         171 :         if (error_on_failure)
    1788 LBC           0 :             ereport(ERROR,
    1789                 :                     (errcode_for_file_access(),
    1790 ECB             :                      errmsg("could not create temporary file \"%s\": %m",
    1791                 :                             path)));
    1792 EUB             :         else
    1793 GBC         171 :             return file;    /* failure tolerated: caller sees file <= 0 */
    1794                 :     }
    1795                 : 
    1796                 :     /* Mark it for temp_file_limit accounting. */
    1797            1439 :     VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
    1798                 : 
    1799                 :     /* Register it for automatic close. */
    1800 CBC        1439 :     RegisterTemporaryFile(file);
    1801 ECB             : 
    1802 GIC        1439 :     return file;
    1803                 : }
    1804 EUB             : 
    1805                 : /*
    1806                 :  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
    1807                 :  * another backend.  Files opened this way don't count against the
    1808                 :  * temp_file_limit of the caller, are automatically closed at the end of the
    1809                 :  * transaction but are not deleted on close.
    1810 ECB             :  */
    1811                 : File
    1812 GIC        4059 : PathNameOpenTemporaryFile(const char *path, int mode)
    1813                 : {
    1814                 :     File        file;
    1815                 : 
    1816            4059 :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1817 ECB             : 
    1818 GIC        4059 :     ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1819                 : 
    1820            4059 :     file = PathNameOpenFile(path, mode | PG_BINARY);
    1821 ECB             : 
    1822                 :     /* A missing file (ENOENT) is not an error; the caller just gets file <= 0. */
    1823 GIC        4059 :     if (file <= 0 && errno != ENOENT)
    1824 UIC           0 :         ereport(ERROR,
    1825                 :                 (errcode_for_file_access(),
    1826 ECB             :                  errmsg("could not open temporary file \"%s\": %m",
    1827                 :                         path)));
    1828                 : 
    1829 GIC        4059 :     if (file > 0)
    1830                 :     {
    1831 ECB             :         /* Register it for automatic close. */
    1832 GIC        1894 :         RegisterTemporaryFile(file);
    1833                 :     }
    1834                 : 
    1835            4059 :     return file;
    1836                 : }
    1837 EUB             : 
    1838                 : /*
    1839                 :  * Delete a file by pathname.  Return true if the file was removed, false if
    1840                 :  * it didn't exist or could not be unlinked.
    1841 ECB             :  */
    1842                 : bool
    1843 GIC        3228 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
    1844                 : {
    1845 ECB             :     struct stat filestats;
    1846                 :     int         stat_errno;
    1847                 : 
    1848                 :     /* Get the final size for pgstat reporting; remember errno for later. */
    1849 GIC        3228 :     if (stat(path, &filestats) != 0)
    1850            1789 :         stat_errno = errno;
    1851 ECB             :     else
    1852 CBC        1439 :         stat_errno = 0;
    1853                 : 
    1854                 :     /*
    1855                 :      * Unlike FileClose's automatic file deletion code, we tolerate
    1856                 :      * non-existence to support BufFileDeleteFileSet which doesn't know how
    1857                 :      * many segments it has to delete until it runs out.
    1858 ECB             :      */
    1859 GIC        3228 :     if (stat_errno == ENOENT)
    1860            1789 :         return false;
    1861                 : 
    1862            1439 :     if (unlink(path) < 0)
    1863                 :     {
    1864 UIC           0 :         if (errno != ENOENT)
    1865               0 :             ereport(error_on_failure ? ERROR : LOG,
    1866                 :                     (errcode_for_file_access(),
    1867                 :                      errmsg("could not unlink temporary file \"%s\": %m",
    1868                 :                             path)));
    1869               0 :         return false;
    1870 ECB             :     }
    1871                 : 
    1872 GIC        1439 :     if (stat_errno == 0)
    1873            1439 :         ReportTemporaryFileUsage(path, filestats.st_size);
    1874 ECB             :     else
    1875 EUB             :     {
    1876 UIC           0 :         errno = stat_errno; /* restore stat()'s errno so %m reports it */
    1877 LBC           0 :         ereport(LOG,
    1878                 :                 (errcode_for_file_access(),
    1879                 :                  errmsg("could not stat file \"%s\": %m", path)));
    1880 ECB             :     }
    1881 EUB             : 
    1882 GIC        1439 :     return true;
    1883                 : }
    1884                 : 
    1885                 : /*
    1886 ECB             :  * Close a file when done with it: close the fd, unlink if FD_DELETE_AT_CLOSE, free the VFD.
    1887                 :  */
    1888                 : void
    1889 GIC      978713 : FileClose(File file)
    1890 EUB             : {
    1891                 :     Vfd        *vfdP;
    1892                 : 
    1893 GIC      978713 :     Assert(FileIsValid(file));
    1894                 : 
    1895                 :     DO_DB(elog(LOG, "FileClose: %d (%s)",
    1896                 :                file, VfdCache[file].fileName));
    1897                 : 
    1898 CBC      978713 :     vfdP = &VfdCache[file];
    1899 ECB             : 
    1900 GIC      978713 :     if (!FileIsNotOpen(file))
    1901                 :     {
    1902                 :         /* close the file */
    1903          883076 :         if (close(vfdP->fd) != 0)
    1904 ECB             :         {
    1905                 :             /*
    1906                 :              * We may need to panic on failure to close non-temporary files;
    1907                 :              * see LruDelete.
    1908                 :              */
    1909 UIC           0 :             elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1910                 :                  "could not close file \"%s\": %m", vfdP->fileName);
    1911                 :         }
    1912                 : 
    1913 GIC      883076 :         --nfile;
    1914          883076 :         vfdP->fd = VFD_CLOSED;
    1915                 : 
    1916                 :         /* remove the file from the lru ring */
    1917 CBC      883076 :         Delete(file);
    1918                 :     }
    1919                 : 
    1920 GIC      978713 :     if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    1921                 :     {
    1922 ECB             :         /* Subtract its size from current usage (do first in case of error) */
    1923 GIC        3222 :         temporary_files_size -= vfdP->fileSize;
    1924            3222 :         vfdP->fileSize = 0;
    1925                 :     }
    1926                 : 
    1927                 :     /*
    1928 ECB             :      * Delete the file if it was temporary, and make a log entry if wanted
    1929                 :      */
    1930 GBC      978713 :     if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
    1931                 :     {
    1932 ECB             :         struct stat filestats;
    1933                 :         int         stat_errno;
    1934                 : 
    1935                 :         /*
    1936                 :          * If we get an error, as could happen within the ereport/elog calls,
    1937                 :          * we'll come right back here during transaction abort.  Reset the
    1938                 :          * flag to ensure that we can't get into an infinite loop.  This code
    1939                 :          * is arranged to ensure that the worst-case consequence is failing to
    1940                 :          * emit log message(s), not failing to attempt the unlink.
    1941                 :          */
    1942 GIC        1783 :         vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
    1943                 : 
    1944                 : 
    1945 ECB             :         /* first try the stat(), saving its errno for the report below */
    1946 GIC        1783 :         if (stat(vfdP->fileName, &filestats))
    1947 UIC           0 :             stat_errno = errno;
    1948                 :         else
    1949 CBC        1783 :             stat_errno = 0;
    1950                 : 
    1951                 :         /* in any case do the unlink */
    1952 GIC        1783 :         if (unlink(vfdP->fileName))
    1953 UIC           0 :             ereport(LOG,
    1954                 :                     (errcode_for_file_access(),
    1955 ECB             :                      errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
    1956 EUB             : 
    1957                 :         /* and last report the stat results */
    1958 CBC        1783 :         if (stat_errno == 0)
    1959 GBC        1783 :             ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
    1960                 :         else
    1961 ECB             :         {
    1962 LBC           0 :             errno = stat_errno;
    1963 UBC           0 :             ereport(LOG,
    1964                 :                     (errcode_for_file_access(),
    1965 ECB             :                      errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
    1966                 :         }
    1967                 :     }
    1968                 : 
    1969                 :     /* Unregister it from the resource owner */
    1970 GIC      978713 :     if (vfdP->resowner)
    1971 CBC        5110 :         ResourceOwnerForgetFile(vfdP->resowner, file);
    1972                 : 
    1973                 :     /*
    1974                 :      * Return the Vfd slot to the free list
    1975                 :      */
    1976 GIC      978713 :     FreeVfd(file);
    1977 CBC      978713 : }
    1978                 : 
    1979                 : /*
    1980                 :  * FilePrefetch - initiate asynchronous read of a given range of the file.
    1981                 :  *
    1982                 :  * Currently the only implementation of this function is using posix_fadvise
    1983                 :  * which is the simplest standardized interface that accomplishes this.
    1984 ECB             :  * We could add an implementation using libaio in the future; but note that
    1985                 :  * this API is inappropriate for libaio, which wants to have a buffer provided
    1986 EUB             :  * to read into.
    1987                 :  */
    1988 ECB             : int
    1989 GNC      212103 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
    1990 ECB             : {
    1991                 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    1992                 :     int         returnCode;
    1993                 : 
    1994 GIC      212103 :     Assert(FileIsValid(file));
    1995 ECB             : 
    1996                 :     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    1997                 :                file, VfdCache[file].fileName,
    1998                 :                (int64) offset, (int64) amount));
    1999                 : 
    2000 GIC      212103 :     returnCode = FileAccess(file);
    2001          212103 :     if (returnCode < 0)
    2002 UIC           0 :         return returnCode;
    2003                 : 
    2004 GIC      212103 :     pgstat_report_wait_start(wait_event_info);
    2005          212103 :     returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
    2006                 :                                POSIX_FADV_WILLNEED);
    2007          212103 :     pgstat_report_wait_end();
    2008                 : 
    2009          212103 :     return returnCode;      /* posix_fadvise()'s result, passed through as-is */
    2010                 : #else
    2011                 :     Assert(FileIsValid(file));
    2012                 :     return 0;               /* no posix_fadvise(POSIX_FADV_WILLNEED) in this build: do nothing */
    2013                 : #endif
    2014                 : }
    2015                 : /* FileWriteback - hand a byte range to pg_flush_data(); no-op if nbytes <= 0, PG_O_DIRECT, or FileAccess fails */
    2016                 : void
    2017          134022 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    2018                 : {
    2019 EUB             :     int         returnCode;
    2020                 : 
    2021 GIC      134022 :     Assert(FileIsValid(file));
    2022                 : 
    2023 ECB             :     DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2024                 :                file, VfdCache[file].fileName,
    2025                 :                (int64) offset, (int64) nbytes));
    2026                 : 
    2027 CBC      134022 :     if (nbytes <= 0)
    2028 UIC           0 :         return;
    2029                 : 
    2030 GNC      134022 :     if (VfdCache[file].fileFlags & PG_O_DIRECT)
    2031 UNC           0 :         return;
    2032                 : 
    2033 GIC      134022 :     returnCode = FileAccess(file);
    2034          134022 :     if (returnCode < 0)
    2035 UIC           0 :         return;             /* failure is silently dropped: this function returns void */
    2036 ECB             : 
    2037 GIC      134022 :     pgstat_report_wait_start(wait_event_info);
    2038          134022 :     pg_flush_data(VfdCache[file].fd, offset, nbytes);
    2039          134022 :     pgstat_report_wait_end();
    2040                 : }
    2041                 : /* FileRead - pg_pread() into 'buffer' at 'offset', retrying on EINTR; returns the pread result or FileAccess's error */
    2042                 : int
    2043 GNC     1721666 : FileRead(File file, void *buffer, size_t amount, off_t offset,
    2044 ECB             :          uint32 wait_event_info)
    2045 EUB             : {
    2046                 :     int         returnCode;
    2047 ECB             :     Vfd        *vfdP;
    2048                 : 
    2049 GIC     1721666 :     Assert(FileIsValid(file));
    2050                 : 
    2051                 :     DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
    2052                 :                file, VfdCache[file].fileName,
    2053                 :                (int64) offset,
    2054                 :                amount, buffer));
    2055                 : 
    2056         1721666 :     returnCode = FileAccess(file);
    2057 CBC     1721666 :     if (returnCode < 0)
    2058 UIC           0 :         return returnCode;
    2059 EUB             : 
    2060 GIC     1721666 :     vfdP = &VfdCache[file];
    2061 EUB             : 
    2062 GIC     1721666 : retry:
    2063 GBC     1721666 :     pgstat_report_wait_start(wait_event_info);
    2064 GIC     1721666 :     returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
    2065 GBC     1721666 :     pgstat_report_wait_end();
    2066 EUB             : 
    2067 GBC     1721666 :     if (returnCode < 0)
    2068                 :     {
    2069                 :         /*
    2070                 :          * Windows may run out of kernel buffers and return "Insufficient
    2071                 :          * system resources" error.  Wait a bit and retry to solve it.
    2072                 :          *
    2073                 :          * It is rumored that EINTR is also possible on some Unix filesystems,
    2074 ECB             :          * in which case immediate retry is indicated.
    2075                 :          */
    2076                 : #ifdef WIN32
    2077                 :         DWORD       error = GetLastError();
    2078                 : 
    2079                 :         switch (error)
    2080                 :         {
    2081                 :             case ERROR_NO_SYSTEM_RESOURCES:
    2082 EUB             :                 pg_usleep(1000L);
    2083                 :                 errno = EINTR;  /* reuse the EINTR retry path below */
    2084 ECB             :                 break;
    2085                 :             default:
    2086                 :                 _dosmaperr(error);
    2087                 :                 break;
    2088                 :         }
    2089                 : #endif
    2090                 :         /* OK to retry if interrupted */
    2091 LBC           0 :         if (errno == EINTR)
    2092 UIC           0 :             goto retry;
    2093 ECB             :     }
    2094                 : 
    2095 CBC     1721666 :     return returnCode;
    2096 ECB             : }
    2097                 : /* FileWrite - pg_pwrite() 'buffer' at 'offset', retrying on EINTR; temp files are checked against temp_file_limit */
    2098                 : int
    2099 GNC      993512 : FileWrite(File file, const void *buffer, size_t amount, off_t offset,
    2100                 :           uint32 wait_event_info)
    2101                 : {
    2102                 :     int         returnCode;
    2103                 :     Vfd        *vfdP;
    2104                 : 
    2105 GIC      993512 :     Assert(FileIsValid(file));
    2106                 : 
    2107                 :     DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p",
    2108                 :                file, VfdCache[file].fileName,
    2109                 :                (int64) offset,
    2110                 :                amount, buffer));
    2111                 : 
    2112          993512 :     returnCode = FileAccess(file);
    2113          993512 :     if (returnCode < 0)
    2114 UIC           0 :         return returnCode;
    2115                 : 
    2116 GIC      993512 :     vfdP = &VfdCache[file];
    2117                 : 
    2118                 :     /*
    2119                 :      * If enforcing temp_file_limit and it's a temp file, check to see if the
    2120 EUB             :      * write would overrun temp_file_limit, and throw error if so.  Note: it's
    2121                 :      * really a modularity violation to throw error here; we should set errno
    2122                 :      * and return -1.  However, there's no way to report a suitable error
    2123                 :      * message if we do that.  All current callers would just throw error
    2124 ECB             :      * immediately anyway, so this is safe at present.
    2125                 :      */
    2126 GIC      993512 :     if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
    2127                 :     {
    2128 LBC           0 :         off_t       past_write = offset + amount;
    2129                 : 
    2130 UIC           0 :         if (past_write > vfdP->fileSize)
    2131                 :         {
    2132 LBC           0 :             uint64      newTotal = temporary_files_size;
    2133                 : 
    2134 UIC           0 :             newTotal += past_write - vfdP->fileSize;
    2135               0 :             if (newTotal > (uint64) temp_file_limit * (uint64) 1024)    /* limit is in kB */
    2136               0 :                 ereport(ERROR,
    2137 ECB             :                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
    2138                 :                          errmsg("temporary file size exceeds temp_file_limit (%dkB)",
    2139 EUB             :                                 temp_file_limit)));
    2140                 :         }
    2141 ECB             :     }
    2142                 : 
    2143 CBC      993512 : retry:
    2144 GIC      993512 :     errno = 0;              /* lets us detect a short write that sets no errno */
    2145 CBC      993512 :     pgstat_report_wait_start(wait_event_info);
    2146 GIC      993512 :     returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
    2147          993512 :     pgstat_report_wait_end();
    2148                 : 
    2149                 :     /* if write didn't set errno, assume problem is no disk space */
    2150          993512 :     if (returnCode != amount && errno == 0)
    2151 UIC           0 :         errno = ENOSPC;
    2152                 : 
    2153 GIC      993512 :     if (returnCode >= 0)
    2154                 :     {
    2155 ECB             :         /*
    2156                 :          * Maintain fileSize and temporary_files_size if it's a temp file.
    2157                 :          */
    2158 GIC      993512 :         if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    2159                 :         {
    2160 CBC       58193 :             off_t       past_write = offset + amount;
    2161                 : 
    2162 GIC       58193 :             if (past_write > vfdP->fileSize)
    2163                 :             {
    2164           39327 :                 temporary_files_size += past_write - vfdP->fileSize;
    2165           39327 :                 vfdP->fileSize = past_write;
    2166 ECB             :             }
    2167                 :         }
    2168 EUB             :     }
    2169                 :     else
    2170 ECB             :     {
    2171                 :         /*
    2172                 :          * See comments in FileRead()
    2173                 :          */
    2174                 : #ifdef WIN32
    2175 EUB             :         DWORD       error = GetLastError();
    2176 ECB             : 
    2177                 :         switch (error)
    2178                 :         {
    2179 EUB             :             case ERROR_NO_SYSTEM_RESOURCES:
    2180                 :                 pg_usleep(1000L);
    2181                 :                 errno = EINTR;
    2182                 :                 break;
    2183                 :             default:
    2184 ECB             :                 _dosmaperr(error);
    2185                 :                 break;
    2186                 :         }
    2187                 : #endif
    2188                 :         /* OK to retry if interrupted */
    2189 UIC           0 :         if (errno == EINTR)
    2190               0 :             goto retry;
    2191                 :     }
    2192                 : 
    2193 GIC      993512 :     return returnCode;
    2194                 : }
    2195                 : /* FileSync - pg_fsync() the file; returns pg_fsync's result, or FileAccess's negative result if that fails */
    2196                 : int
    2197           59883 : FileSync(File file, uint32 wait_event_info)
    2198                 : {
    2199                 :     int         returnCode;
    2200 ECB             : 
    2201 GIC       59883 :     Assert(FileIsValid(file));
    2202                 : 
    2203                 :     DO_DB(elog(LOG, "FileSync: %d (%s)",
    2204                 :                file, VfdCache[file].fileName));
    2205 ECB             : 
    2206 GIC       59883 :     returnCode = FileAccess(file);
    2207           59883 :     if (returnCode < 0)
    2208 UIC           0 :         return returnCode;
    2209                 : 
    2210 GIC       59883 :     pgstat_report_wait_start(wait_event_info);
    2211 CBC       59883 :     returnCode = pg_fsync(VfdCache[file].fd);
    2212           59883 :     pgstat_report_wait_end();
    2213 EUB             : 
    2214 GIC       59883 :     return returnCode;
    2215 ECB             : }
    2216                 : 
    2217                 : /*
    2218                 :  * Zero a region of the file.
    2219                 :  *
    2220                 :  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
    2221                 :  * appropriate error.
    2222                 :  */
    2223                 : int
    2224 GNC      343400 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
    2225                 : {
    2226                 :     int         returnCode;
    2227                 :     ssize_t     written;
    2228                 : 
    2229          343400 :     Assert(FileIsValid(file));
    2230                 : 
    2231                 :     DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2232                 :                file, VfdCache[file].fileName,
    2233                 :                (int64) offset, (int64) amount));
    2234                 : 
    2235          343400 :     returnCode = FileAccess(file);
    2236          343400 :     if (returnCode < 0)
    2237 UNC           0 :         return returnCode;
    2238                 : 
    2239 GNC      343400 :     pgstat_report_wait_start(wait_event_info);
    2240          343400 :     written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
    2241          343400 :     pgstat_report_wait_end();
    2242                 : 
    2243          343400 :     if (written < 0)
    2244 UNC           0 :         return -1;
    2245 GNC      343400 :     else if (written != amount)     /* short write */
    2246                 :     {
    2247                 :         /* if errno is unset, assume problem is no disk space */
    2248 UNC           0 :         if (errno == 0)
    2249               0 :             errno = ENOSPC;
    2250               0 :         return -1;
    2251                 :     }
    2252                 : 
    2253 GNC      343400 :     return 0;
    2254                 : }
    2255                 : 
    2256                 : /*
    2257                 :  * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
    2258                 :  * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
    2259                 :  * use FileZero() instead.
    2260                 :  *
    2261                 :  * Note that at least glibc implements posix_fallocate() in userspace if not
    2262                 :  * implemented by the filesystem. That's not the case for all environments
    2263                 :  * though.
    2264                 :  *
                            * posix_fallocate() reports failure through its return value, not through
                            * errno.  It can fail with EINTR when a signal arrives; that says nothing
                            * about the request itself, so the call is simply retried, with each
                            * attempt reported under wait_event_info.
                            *
    2265                 :  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
    2266                 :  * appropriate error.
    2267                 :  */
    2268                 : int
    2269             385 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
    2270                 : {
    2271                 : #ifdef HAVE_POSIX_FALLOCATE
    2272                 :     int         returnCode;
    2273                 : 
    2274             385 :     Assert(FileIsValid(file));
    2275                 : 
    2276                 :     DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2277                 :                file, VfdCache[file].fileName,
    2278                 :                (int64) offset, (int64) amount));
    2279                 : 
    2280             385 :     returnCode = FileAccess(file);
    2281             385 :     if (returnCode < 0)
    2282 UNC           0 :         return -1;
    2283                 : 
                           retry:
    2284 GNC         385 :     pgstat_report_wait_start(wait_event_info);
    2285             385 :     returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
    2286             385 :     pgstat_report_wait_end();
    2287                 : 
    2288             385 :     if (returnCode == 0)
    2289             385 :         return 0;
                               else if (returnCode == EINTR)
                                   goto retry;         /* interrupted by a signal: try again */
    2290                 : 
    2291                 :     /* for compatibility with %m printing etc */
    2292 UNC           0 :     errno = returnCode;
    2293                 : 
    2294                 :     /*
    2295                 :      * Return in cases of a "real" failure, if fallocate is not supported,
    2296                 :      * fall through to the FileZero() backed implementation.
    2297                 :      */
    2298               0 :     if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
    2299               0 :         return -1;
    2300                 : #endif
    2301                 : 
    2302               0 :     return FileZero(file, offset, amount, wait_event_info);
    2303                 : }
    2304                 : 
                           /*
                            * FileSize --- return the current size of a virtual file.
                            *
                            * If the VFD is not currently open (per FileIsNotOpen), FileAccess() is
                            * called first and (off_t) -1 is returned if that fails.  The size is
                            * obtained with lseek(fd, 0, SEEK_END), so the result is whatever lseek()
                            * returns, including its own failure value.
                            */
    2305 ECB             : off_t
    2306 GIC     4252400 : FileSize(File file)
    2307 ECB             : {
    2308 CBC     4252400 :     Assert(FileIsValid(file));
    2309                 : 
    2310                 :     DO_DB(elog(LOG, "FileSize %d (%s)",
    2311 EUB             :                file, VfdCache[file].fileName));
    2312                 : 
    2313 GIC     4252400 :     if (FileIsNotOpen(file))
    2314                 :     {
    2315           79988 :         if (FileAccess(file) < 0)
    2316 UIC           0 :             return (off_t) -1;
    2317 EUB             :     }
    2318                 : 
    2319 GIC     4252400 :     return lseek(VfdCache[file].fd, 0, SEEK_END);
    2320                 : }
    2321 EUB             : 
                           /*
                            * FileTruncate --- set the length of a virtual file with ftruncate().
                            *
                            * Returns FileAccess()'s negative result if that call fails, otherwise the
                            * result of ftruncate(fd, offset).  The ftruncate() call is reported under
                            * wait_event_info.
                            *
                            * After a successful truncation, if the size recorded in the VFD
                            * (fileSize) is larger than "offset", temporary_files_size is reduced by
                            * the difference and fileSize is set to "offset".
                            */
    2322                 : int
    2323 GIC         396 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
    2324                 : {
    2325 ECB             :     int         returnCode;
    2326                 : 
    2327 CBC         396 :     Assert(FileIsValid(file));
    2328                 : 
    2329                 :     DO_DB(elog(LOG, "FileTruncate %d (%s)",
    2330                 :                file, VfdCache[file].fileName));
    2331                 : 
    2332             396 :     returnCode = FileAccess(file);
    2333 GIC         396 :     if (returnCode < 0)
    2334 LBC           0 :         return returnCode;
    2335 EUB             : 
    2336 GIC         396 :     pgstat_report_wait_start(wait_event_info);
    2337             396 :     returnCode = ftruncate(VfdCache[file].fd, offset);
    2338 CBC         396 :     pgstat_report_wait_end();
    2339                 : 
    2340 GIC         396 :     if (returnCode == 0 && VfdCache[file].fileSize > offset)
    2341                 :     {
    2342 ECB             :         /* adjust our state for truncation of a temp file */
    2343 UIC           0 :         Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
    2344               0 :         temporary_files_size -= VfdCache[file].fileSize - offset;
    2345               0 :         VfdCache[file].fileSize = offset;
    2346 ECB             :     }
    2347                 : 
    2348 GIC         396 :     return returnCode;
    2349                 : }
    2350                 : 
    2351 ECB             : /*
    2352                 :  * Return the pathname associated with an open file.
    2353 EUB             :  *
    2354                 :  * The returned string points to an internal buffer, which is valid until
    2355 ECB             :  * the file is closed.
                            *
                            * The pointer returned is the VFD's own fileName field, not a copy.
    2356                 :  */
    2357                 : char *
    2358 UIC           0 : FilePathName(File file)
    2359 ECB             : {
    2360 UIC           0 :     Assert(FileIsValid(file));
    2361                 : 
    2362 UBC           0 :     return VfdCache[file].fileName;
    2363 EUB             : }
    2364                 : 
    2365                 : /*
    2366                 :  * Return the raw file descriptor of an opened file.
    2367 ECB             :  *
    2368                 :  * The returned file descriptor will be valid until the file is closed, but
    2369                 :  * there are a lot of things that can make that happen.  So the caller should
    2370                 :  * be careful not to do much of anything else before it finishes using the
    2371                 :  * returned file descriptor.
                            *
                            * NOTE(review): this simply returns the VFD's fd field and does not call
                            * FileAccess(); presumably the caller must already know that the file is
                            * physically open --- confirm.
    2372                 :  */
    2373                 : int
    2374 UIC           0 : FileGetRawDesc(File file)
    2375                 : {
    2376               0 :     Assert(FileIsValid(file));
    2377 UBC           0 :     return VfdCache[file].fd;
    2378                 : }
    2379 EUB             : 
    2380                 : /*
    2381                 :  * FileGetRawFlags - returns the file flags used on open(2)
                            *
                            * The value is read from the VFD's fileFlags field.
    2382                 :  */
    2383                 : int
    2384 UIC           0 : FileGetRawFlags(File file)
    2385                 : {
    2386               0 :     Assert(FileIsValid(file));
    2387               0 :     return VfdCache[file].fileFlags;
    2388                 : }
    2389                 : 
    2390                 : /*
    2391                 :  * FileGetRawMode - returns the mode bitmask passed to open(2)
                            *
                            * The value is read from the VFD's fileMode field.
    2392                 :  */
    2393 EUB             : mode_t
    2394 UIC           0 : FileGetRawMode(File file)
    2395 EUB             : {
    2396 UBC           0 :     Assert(FileIsValid(file));
    2397 UIC           0 :     return VfdCache[file].fileMode;
    2398                 : }
    2399                 : 
    2400                 : /*
    2401                 :  * Make room for another allocatedDescs[] array entry if needed and possible.
    2402                 :  * Returns true if an array element is available.
                            *
                            * Returns false if the array is full and cannot be enlarged, either
                            * because max_safe_fds / 3 is not larger than the current size or because
                            * realloc() failed.  Failure of the very first malloc() is instead
                            * reported with ereport(ERROR).
                            *
                            * This only guarantees that a slot exists; numAllocatedDescs is not
                            * changed here.
    2403 EUB             :  */
    2404                 : static bool
    2405 GBC      704666 : reserveAllocatedDesc(void)
    2406 EUB             : {
    2407                 :     AllocateDesc *newDescs;
    2408                 :     int         newMax;
    2409                 : 
    2410                 :     /* Quick out if array already has a free slot. */
    2411 GIC      704666 :     if (numAllocatedDescs < maxAllocatedDescs)
    2412          702833 :         return true;
    2413 EUB             : 
    2414                 :     /*
    2415                 :      * If the array hasn't yet been created in the current process, initialize
    2416                 :      * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
    2417                 :      * we will ever need, anyway.  We don't want to look at max_safe_fds
    2418                 :      * immediately because set_max_safe_fds() may not have run yet.
    2419                 :      */
    2420 GIC        1833 :     if (allocatedDescs == NULL)
    2421                 :     {
    2422            1833 :         newMax = FD_MINFREE / 3;
    2423            1833 :         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
    2424 ECB             :         /* Out of memory already?  Treat as fatal error. */
    2425 GIC        1833 :         if (newDescs == NULL)
    2426 UIC           0 :             ereport(ERROR,
    2427                 :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    2428                 :                      errmsg("out of memory")));
    2429 GIC        1833 :         allocatedDescs = newDescs;
    2430 CBC        1833 :         maxAllocatedDescs = newMax;
    2431            1833 :         return true;
    2432                 :     }
    2433                 : 
    2434                 :     /*
    2435                 :      * Consider enlarging the array beyond the initial allocation used above.
    2436                 :      * By the time this happens, max_safe_fds should be known accurately.
    2437                 :      *
    2438                 :      * We mustn't let allocated descriptors hog all the available FDs, and in
    2439 ECB             :      * practice we'd better leave a reasonable number of FDs for VFD use.  So
    2440                 :      * set the maximum to max_safe_fds / 3.  (This should certainly be at
    2441                 :      * least as large as the initial size, FD_MINFREE / 3, so we aren't
    2442                 :      * tightening the restriction here.)  Recall that "external" FDs are
    2443                 :      * allowed to consume another third of max_safe_fds.
    2444                 :      */
    2445 UBC           0 :     newMax = max_safe_fds / 3;
    2446 UIC           0 :     if (newMax > maxAllocatedDescs)
    2447                 :     {
                                   /*
                                    * realloc() into a temporary so that the existing array stays
                                    * reachable through allocatedDescs if the call fails.
                                    */
    2448 LBC           0 :         newDescs = (AllocateDesc *) realloc(allocatedDescs,
    2449 ECB             :                                             newMax * sizeof(AllocateDesc));
    2450                 :         /* Treat out-of-memory as a non-fatal error. */
    2451 UIC           0 :         if (newDescs == NULL)
    2452               0 :             return false;
    2453               0 :         allocatedDescs = newDescs;
    2454               0 :         maxAllocatedDescs = newMax;
    2455               0 :         return true;
    2456                 :     }
    2457                 : 
    2458                 :     /* Can't enlarge allocatedDescs[] any more. */
    2459               0 :     return false;
    2460                 : }
    2461                 : 
    2462                 : /*
    2463                 :  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
    2464 EUB             :  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
    2465                 :  * necessary to open the file.  When done, call FreeFile rather than fclose.
    2466                 :  *
    2467                 :  * Note that files that will be open for any significant length of time
    2468                 :  * should NOT be handled this way, since they cannot share kernel file
    2469                 :  * descriptors with other files; there is grave risk of running out of FDs
    2470                 :  * if anyone locks down too many FDs.  Most callers of this routine are
    2471                 :  * simply reading a config file that they will read and close immediately.
    2472                 :  *
    2473                 :  * fd.c will automatically close all files opened with AllocateFile at
    2474                 :  * transaction commit or abort; this prevents FD leakage if a routine
    2475                 :  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
    2476                 :  *
    2477                 :  * Ideally this should be the *only* direct call of fopen() in the backend.
                            *
                            * Returns the FILE pointer on success, or NULL if fopen() fails, in which
                            * case errno is fopen()'s error.  An ereport(ERROR) is raised if no
                            * allocatedDescs[] slot can be reserved.
    2478                 :  */
    2479                 : FILE *
    2480 GIC       63940 : AllocateFile(const char *name, const char *mode)
    2481                 : {
    2482                 :     FILE       *file;
    2483                 : 
    2484                 :     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
    2485                 :                numAllocatedDescs, name));
    2486                 : 
    2487                 :     /* Can we allocate another non-virtual FD? */
    2488           63940 :     if (!reserveAllocatedDesc())
    2489 UIC           0 :         ereport(ERROR,
    2490                 :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2491                 :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2492                 :                         maxAllocatedDescs, name)));
    2493                 : 
    2494                 :     /* Close excess kernel FDs. */
    2495 GIC       63940 :     ReleaseLruFiles();
    2496                 : 
    2497           63940 : TryAgain:
    2498           63940 :     if ((file = fopen(name, mode)) != NULL)
    2499 ECB             :     {
                                   /* Record the stream in the slot reserved above. */
    2500 GIC       57546 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2501                 : 
    2502           57546 :         desc->kind = AllocateDescFile;
    2503           57546 :         desc->desc.file = file;
    2504           57546 :         desc->create_subid = GetCurrentSubTransactionId();
    2505           57546 :         numAllocatedDescs++;
    2506           57546 :         return desc->desc.file;
    2507 ECB             :     }
    2508 EUB             : 
                               /*
                                * Out of descriptors: log it, then retry for as long as ReleaseLruFile()
                                * reports that it released something.
                                */
    2509 GIC        6394 :     if (errno == EMFILE || errno == ENFILE)
    2510                 :     {
                                   /* keep fopen()'s errno so it can be restored if we give up */
    2511 UIC           0 :         int         save_errno = errno;
    2512                 : 
    2513               0 :         ereport(LOG,
    2514 ECB             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2515                 :                  errmsg("out of file descriptors: %m; release and retry")));
    2516 LBC           0 :         errno = 0;
    2517               0 :         if (ReleaseLruFile())
    2518 UIC           0 :             goto TryAgain;
    2519 LBC           0 :         errno = save_errno;
    2520                 :     }
    2521 ECB             : 
    2522 CBC        6394 :     return NULL;
    2523 ECB             : }
    2524                 : 
    2525                 : /*
    2526                 :  * Open a file with OpenTransientFilePerm() and pass default file mode for
    2527                 :  * the fileMode parameter.
                            *
                            * The default mode used is pg_file_create_mode.  The return value is
                            * whatever OpenTransientFilePerm() returns.
    2528                 :  */
    2529                 : int
    2530 GBC      599546 : OpenTransientFile(const char *fileName, int fileFlags)
    2531                 : {
    2532          599546 :     return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
    2533                 : }
    2534                 : 
    2535 EUB             : /*
    2536                 :  * Like AllocateFile, but returns an unbuffered fd like open(2)
                            *
                            * The file is opened with BasicOpenFilePerm().  On success the descriptor
                            * is recorded in allocatedDescs[] as an AllocateDescRawFD entry tagged
                            * with the current subtransaction ID, and is returned.  On failure -1 is
                            * returned.  An ereport(ERROR) is raised if no allocatedDescs[] slot can
                            * be reserved.
                            *
                            * NOTE(review): errno on failure is presumably whatever
                            * BasicOpenFilePerm() left behind --- confirm.
    2537                 :  */
    2538                 : int
    2539 GIC      599552 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    2540                 : {
    2541 ECB             :     int         fd;
    2542                 : 
    2543                 :     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
    2544                 :                numAllocatedDescs, fileName));
    2545                 : 
    2546                 :     /* Can we allocate another non-virtual FD? */
    2547 GIC      599552 :     if (!reserveAllocatedDesc())
    2548 UIC           0 :         ereport(ERROR,
    2549 ECB             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2550                 :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2551                 :                         maxAllocatedDescs, fileName)));
    2552                 : 
    2553                 :     /* Close excess kernel FDs. */
    2554 GIC      599552 :     ReleaseLruFiles();
    2555                 : 
    2556          599552 :     fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    2557                 : 
    2558 CBC      599552 :     if (fd >= 0)
    2559                 :     {
                                   /* Record the descriptor in the slot reserved above. */
    2560 GIC      596644 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2561                 : 
    2562          596644 :         desc->kind = AllocateDescRawFD;
    2563          596644 :         desc->desc.fd = fd;
    2564          596644 :         desc->create_subid = GetCurrentSubTransactionId();
    2565          596644 :         numAllocatedDescs++;
    2566 ECB             : 
    2567 GBC      596644 :         return fd;
    2568                 :     }
    2569                 : 
    2570 GIC        2908 :     return -1;                  /* failure */
    2571                 : }
    2572                 : 
    2573 ECB             : /*
    2574                 :  * Routines that want to initiate a pipe stream should use OpenPipeStream
    2575                 :  * rather than plain popen().  This lets fd.c deal with freeing FDs if
    2576                 :  * necessary.  When done, call ClosePipeStream rather than pclose.
    2577                 :  *
    2578                 :  * This function also ensures that the popen'd program is run with default
    2579                 :  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
    2580                 :  * uses.  This ensures desirable response to, eg, closing a read pipe early.
                            *
                            * Returns the FILE pointer on success, or NULL if popen() fails, in which
                            * case errno is popen()'s error.  An ereport(ERROR) is raised if no
                            * allocatedDescs[] slot can be reserved.
    2581                 :  */
    2582                 : FILE *
    2583 CBC         311 : OpenPipeStream(const char *command, const char *mode)
    2584 ECB             : {
    2585                 :     FILE       *file;
    2586                 :     int         save_errno;
    2587                 : 
    2588                 :     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
    2589                 :                numAllocatedDescs, command));
    2590                 : 
    2591                 :     /* Can we allocate another non-virtual FD? */
    2592 GIC         311 :     if (!reserveAllocatedDesc())
    2593 UIC           0 :         ereport(ERROR,
    2594                 :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2595                 :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
    2596                 :                         maxAllocatedDescs, command)));
    2597                 : 
    2598                 :     /* Close excess kernel FDs. */
    2599 GIC         311 :     ReleaseLruFiles();
    2600                 : 
    2601             311 : TryAgain:
                               /*
                                * NOTE(review): fflush(NULL) presumably empties our stdio buffers so
                                * that the child process cannot inherit and re-emit pending output ---
                                * confirm.
                                */
    2602 GNC         311 :     fflush(NULL);
                               /*
                                * Run popen() with default SIGPIPE handling, then put SIG_IGN back.
                                * popen()'s errno is saved across the second pqsignal() call so that
                                * the tests below see it.
                                */
    2603 GIC         311 :     pqsignal(SIGPIPE, SIG_DFL);
    2604             311 :     errno = 0;
    2605             311 :     file = popen(command, mode);
    2606             311 :     save_errno = errno;
    2607             311 :     pqsignal(SIGPIPE, SIG_IGN);
    2608             311 :     errno = save_errno;
    2609             311 :     if (file != NULL)
    2610 ECB             :     {
                                   /* Record the stream in the slot reserved above. */
    2611 GBC         311 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2612                 : 
    2613 GIC         311 :         desc->kind = AllocateDescPipe;
    2614             311 :         desc->desc.file = file;
    2615             311 :         desc->create_subid = GetCurrentSubTransactionId();
    2616             311 :         numAllocatedDescs++;
    2617 CBC         311 :         return desc->desc.file;
    2618                 :     }
    2619 ECB             : 
                               /*
                                * Out of descriptors: log it, then retry for as long as ReleaseLruFile()
                                * reports that it released something.
                                */
    2620 LBC           0 :     if (errno == EMFILE || errno == ENFILE)
    2621 ECB             :     {
    2622 LBC           0 :         ereport(LOG,
    2623 ECB             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2624                 :                  errmsg("out of file descriptors: %m; release and retry")));
    2625 LBC           0 :         if (ReleaseLruFile())
    2626               0 :             goto TryAgain;
    2627               0 :         errno = save_errno;
    2628                 :     }
    2629 ECB             : 
    2630 UIC           0 :     return NULL;
    2631 ECB             : }
    2632                 : 
    2633                 : /*
    2634                 :  * Free an AllocateDesc of any type.
    2635                 :  *
    2636                 :  * The argument *must* point into the allocatedDescs[] array.
                            *
                            * Returns the result of the close routine matching the entry's kind
                            * (fclose, pclose, closedir or close).  An unrecognized kind is reported
                            * with elog(ERROR).
                            *
                            * The freed slot is refilled with the last array entry, so the order of
                            * allocatedDescs[] is not preserved across calls.
    2637                 :  */
    2638 EUB             : static int
    2639 GIC      694748 : FreeDesc(AllocateDesc *desc)
    2640 EUB             : {
    2641                 :     int         result;
    2642                 : 
    2643                 :     /* Close the underlying object */
    2644 GBC      694748 :     switch (desc->kind)
    2645 EUB             :     {
    2646 GIC       57546 :         case AllocateDescFile:
    2647           57546 :             result = fclose(desc->desc.file);
    2648 GBC       57546 :             break;
    2649 GIC         311 :         case AllocateDescPipe:
    2650             311 :             result = pclose(desc->desc.file);
    2651             311 :             break;
    2652           40247 :         case AllocateDescDir:
    2653           40247 :             result = closedir(desc->desc.dir);
    2654           40247 :             break;
    2655          596644 :         case AllocateDescRawFD:
    2656          596644 :             result = close(desc->desc.fd);
    2657 CBC      596644 :             break;
    2658 UIC           0 :         default:
    2659               0 :             elog(ERROR, "AllocateDesc kind not recognized");
    2660                 :             result = 0;         /* keep compiler quiet */
    2661                 :             break;
    2662 ECB             :     }
    2663                 : 
    2664                 :     /* Compact storage in the allocatedDescs array */
    2665 CBC      694748 :     numAllocatedDescs--;
    2666          694748 :     *desc = allocatedDescs[numAllocatedDescs];
    2667 ECB             : 
    2668 CBC      694748 :     return result;
    2669 ECB             : }
    2670                 : 
    2671                 : /*
    2672                 :  * Close a file returned by AllocateFile.
    2673                 :  *
    2674                 :  * Note we do not check fclose's return value --- it is up to the caller
    2675                 :  * to handle close errors.
                            *
                            * allocatedDescs[] is searched from its newest entry backwards for an
                            * AllocateDescFile entry holding this FILE pointer; the result of
                            * FreeDesc() on that entry is returned.  If no entry matches, a WARNING
                            * is emitted and the result of a plain fclose() is returned.
    2676 EUB             :  */
    2677                 : int
    2678 GIC       57537 : FreeFile(FILE *file)
    2679                 : {
    2680                 :     int         i;
    2681                 : 
    2682                 :     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
    2683 ECB             : 
    2684                 :     /* Remove file from list of allocated files, if it's present */
    2685 GIC       57538 :     for (i = numAllocatedDescs; --i >= 0;)
    2686 ECB             :     {
    2687 GIC       57538 :         AllocateDesc *desc = &allocatedDescs[i];
    2688                 : 
    2689           57538 :         if (desc->kind == AllocateDescFile && desc->desc.file == file)
    2690           57537 :             return FreeDesc(desc);
    2691                 :     }
    2692                 : 
    2693                 :     /* Only get here if someone passes us a file not in allocatedDescs */
    2694 UIC           0 :     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
    2695                 : 
    2696 LBC           0 :     return fclose(file);
    2697                 : }
    2698                 : 
    2699                 : /*
    2700                 :  * Close a file returned by OpenTransientFile.
    2701                 :  *
    2702                 :  * Note we do not check close's return value --- it is up to the caller
    2703 ECB             :  * to handle close errors.
                            *
                            * allocatedDescs[] is searched from its newest entry backwards for an
                            * AllocateDescRawFD entry holding this descriptor; the result of
                            * FreeDesc() on that entry is returned.  If no entry matches, a WARNING
                            * is emitted and the result of a plain close() is returned.
    2704                 :  */
    2705                 : int
    2706 GIC      596643 : CloseTransientFile(int fd)
    2707 ECB             : {
    2708                 :     int         i;
    2709                 : 
    2710                 :     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
    2711                 : 
    2712 EUB             :     /* Remove fd from list of allocated files, if it's present */
    2713 GIC      596643 :     for (i = numAllocatedDescs; --i >= 0;)
    2714 EUB             :     {
    2715 GIC      596643 :         AllocateDesc *desc = &allocatedDescs[i];
    2716                 : 
    2717          596643 :         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
    2718          596643 :             return FreeDesc(desc);
    2719                 :     }
    2720                 : 
    2721                 :     /* Only get here if someone passes us a file not in allocatedDescs */
    2722 UIC           0 :     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
    2723                 : 
    2724 LBC           0 :     return close(fd);
    2725                 : }
    2726                 : 
    2727                 : /*
    2728                 :  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
    2729                 :  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
    2730                 :  * necessary to open the directory, and with closing it after an elog.
    2731 ECB             :  * When done, call FreeDir rather than closedir.
    2732                 :  *
    2733                 :  * Returns NULL, with errno set, on failure.  Note that failure detection
    2734                 :  * is commonly left to the following call of ReadDir or ReadDirExtended;
    2735                 :  * see the comments for ReadDir.
    2736                 :  *
    2737                 :  * Ideally this should be the *only* direct call of opendir() in the backend.
                            *
                            * An ereport(ERROR) is raised if no allocatedDescs[] slot can be reserved.
    2738                 :  */
    2739                 : DIR *
    2740 GBC       40863 : AllocateDir(const char *dirname)
    2741                 : {
    2742 EUB             :     DIR        *dir;
    2743                 : 
    2744                 :     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
    2745                 :                numAllocatedDescs, dirname));
    2746                 : 
    2747                 :     /* Can we allocate another non-virtual FD? */
    2748 GIC       40863 :     if (!reserveAllocatedDesc())
    2749 UIC           0 :         ereport(ERROR,
    2750                 :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2751                 :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
    2752                 :                         maxAllocatedDescs, dirname)));
    2753                 : 
    2754                 :     /* Close excess kernel FDs. */
    2755 GIC       40863 :     ReleaseLruFiles();
    2756                 : 
    2757           40863 : TryAgain:
    2758 CBC       40863 :     if ((dir = opendir(dirname)) != NULL)
    2759                 :     {
                                   /* Record the directory handle in the slot reserved above. */
    2760 GIC       40247 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2761                 : 
    2762           40247 :         desc->kind = AllocateDescDir;
    2763           40247 :         desc->desc.dir = dir;
    2764           40247 :         desc->create_subid = GetCurrentSubTransactionId();
    2765           40247 :         numAllocatedDescs++;
    2766 CBC       40247 :         return desc->desc.dir;
    2767 EUB             :     }
    2768                 : 
                               /*
                                * Out of descriptors: log it, then retry for as long as ReleaseLruFile()
                                * reports that it released something.
                                */
    2769 GIC         616 :     if (errno == EMFILE || errno == ENFILE)
    2770                 :     {
                                   /* keep opendir()'s errno so it can be restored if we give up */
    2771 UIC           0 :         int         save_errno = errno;
    2772                 : 
    2773 LBC           0 :         ereport(LOG,
    2774                 :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2775 ECB             :                  errmsg("out of file descriptors: %m; release and retry")));
    2776 LBC           0 :         errno = 0;
    2777 UIC           0 :         if (ReleaseLruFile())
    2778 LBC           0 :             goto TryAgain;
    2779 UIC           0 :         errno = save_errno;
    2780 ECB             :     }
    2781                 : 
    2782 CBC         616 :     return NULL;
    2783 ECB             : }
    2784                 : 
    2785                 : /*
    2786                 :  * Read a directory opened with AllocateDir, ereport'ing any error.
    2787                 :  *
    2788                 :  * This is easier to use than raw readdir() since it takes care of some
    2789 EUB             :  * otherwise rather tedious and error-prone manipulation of errno.  Also,
    2790                 :  * if you are happy with a generic error message for AllocateDir failure,
    2791                 :  * you can just do
    2792                 :  *
    2793                 :  *      dir = AllocateDir(path);
    2794                 :  *      while ((dirent = ReadDir(dir, path)) != NULL)
    2795                 :  *          process dirent;
    2796                 :  *      FreeDir(dir);
    2797                 :  *
    2798                 :  * since a NULL dir parameter is taken as indicating AllocateDir failed.
    2799                 :  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
    2800 ECB             :  * use this shortcut.)
    2801                 :  *
    2802                 :  * The pathname passed to AllocateDir must be passed to this routine too,
    2803                 :  * but it is only used for error reporting.
                            *
                            * This is exactly ReadDirExtended() called with elevel ERROR.
    2804                 :  */
    2805                 : struct dirent *
    2806 GIC     1011151 : ReadDir(DIR *dir, const char *dirname)
    2807                 : {
    2808         1011151 :     return ReadDirExtended(dir, dirname, ERROR);
    2809                 : }
    2810                 : 
    2811                 : /*
    2812                 :  * Alternate version of ReadDir that allows caller to specify the elevel
    2813                 :  * for any error report (whether it's reporting an initial failure of
    2814                 :  * AllocateDir or a subsequent directory read failure).
    2815                 :  *
    2816                 :  * If elevel < ERROR, returns NULL after any error.  With the normal coding
    2817                 :  * pattern, this will result in falling out of the loop immediately as
    2818                 :  * though the directory contained no (more) entries.
    2819                 :  */
    2820                 : struct dirent *
    2821         1898454 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
    2822                 : {
    2823                 :     struct dirent *dent;
    2824 ECB             : 
    2825                 :     /* Give a generic message for AllocateDir failure, if caller didn't */
    2826 CBC     1898454 :     if (dir == NULL)
    2827                 :     {
    2828 GIC           3 :         ereport(elevel,
    2829                 :                 (errcode_for_file_access(),
    2830                 :                  errmsg("could not open directory \"%s\": %m",
    2831                 :                         dirname)));
    2832 UIC           0 :         return NULL;
    2833                 :     }
    2834                 : 
    2835 GIC     1898451 :     errno = 0;
    2836         1898451 :     if ((dent = readdir(dir)) != NULL)
    2837         1865925 :         return dent;
    2838                 : 
    2839 CBC       32526 :     if (errno)
    2840 UIC           0 :         ereport(elevel,
    2841                 :                 (errcode_for_file_access(),
    2842                 :                  errmsg("could not read directory \"%s\": %m",
    2843                 :                         dirname)));
    2844 CBC       32526 :     return NULL;
    2845                 : }
    2846 ECB             : 
    2847                 : /*
    2848                 :  * Close a directory opened with AllocateDir.
    2849                 :  *
    2850 EUB             :  * Returns closedir's return value (with errno set if it's not 0).
    2851                 :  * Note we do not check the return value --- it is up to the caller
    2852                 :  * to handle close errors if wanted.
    2853 ECB             :  *
    2854                 :  * Does nothing if dir == NULL; we assume that directory open failure was
    2855                 :  * already reported if desired.
    2856                 :  */
    2857                 : int
    2858 GBC       40158 : FreeDir(DIR *dir)
    2859                 : {
    2860                 :     int         i;
    2861                 : 
    2862 ECB             :     /* Nothing to do if AllocateDir failed */
    2863 GIC       40158 :     if (dir == NULL)
    2864 UIC           0 :         return 0;
    2865                 : 
    2866                 :     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
    2867                 : 
    2868                 :     /* Remove dir from list of allocated dirs, if it's present */
    2869 GIC       40158 :     for (i = numAllocatedDescs; --i >= 0;)
    2870                 :     {
    2871           40158 :         AllocateDesc *desc = &allocatedDescs[i];
    2872                 : 
    2873           40158 :         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
    2874           40158 :             return FreeDesc(desc);
    2875                 :     }
    2876 ECB             : 
    2877                 :     /* Only get here if someone passes us a dir not in allocatedDescs */
    2878 UIC           0 :     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
    2879                 : 
    2880               0 :     return closedir(dir);
    2881 ECB             : }
    2882 EUB             : 
    2883                 : 
    2884                 : /*
    2885                 :  * Close a pipe stream returned by OpenPipeStream.
    2886                 :  */
    2887 ECB             : int
    2888 GIC         311 : ClosePipeStream(FILE *file)
    2889 ECB             : {
    2890                 :     int         i;
    2891                 : 
    2892                 :     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
    2893                 : 
    2894                 :     /* Remove file from list of allocated files, if it's present */
    2895 GIC         311 :     for (i = numAllocatedDescs; --i >= 0;)
    2896 EUB             :     {
    2897 GIC         311 :         AllocateDesc *desc = &allocatedDescs[i];
    2898 EUB             : 
    2899 GIC         311 :         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
    2900             311 :             return FreeDesc(desc);
    2901                 :     }
    2902                 : 
    2903                 :     /* Only get here if someone passes us a file not in allocatedDescs */
    2904 UIC           0 :     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
    2905                 : 
    2906 LBC           0 :     return pclose(file);
    2907                 : }
    2908                 : 
    2909                 : /*
    2910                 :  * closeAllVfds
    2911                 :  *
    2912                 :  * Force all VFDs into the physically-closed state, so that the fewest
    2913 ECB             :  * possible number of kernel file descriptors are in use.  There is no
    2914                 :  * change in the logical state of the VFDs.
    2915                 :  */
    2916                 : void
    2917 CBC          26 : closeAllVfds(void)
    2918 ECB             : {
    2919                 :     Index       i;
    2920                 : 
    2921 GIC          26 :     if (SizeVfdCache > 0)
    2922 EUB             :     {
    2923 GIC          26 :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    2924 GBC         832 :         for (i = 1; i < SizeVfdCache; i++)
    2925                 :         {
    2926 GIC         806 :             if (!FileIsNotOpen(i))
    2927              64 :                 LruDelete(i);
    2928                 :         }
    2929                 :     }
    2930              26 : }
    2931                 : 
    2932                 : 
    2933                 : /*
    2934                 :  * SetTempTablespaces
    2935 ECB             :  *
    2936                 :  * Define a list (actually an array) of OIDs of tablespaces to use for
    2937                 :  * temporary files.  This list will be used until end of transaction,
    2938                 :  * unless this function is called again before then.  It is caller's
    2939                 :  * responsibility that the passed-in array has adequate lifespan (typically
    2940                 :  * it'd be allocated in TopTransactionContext).
    2941                 :  *
    2942                 :  * Some entries of the array may be InvalidOid, indicating that the current
    2943                 :  * database's default tablespace should be used.
    2944                 :  */
    2945                 : void
    2946 GIC        3645 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2947                 : {
    2948 CBC        3645 :     Assert(numSpaces >= 0);
    2949 GIC        3645 :     tempTableSpaces = tableSpaces;
    2950            3645 :     numTempTableSpaces = numSpaces;
    2951                 : 
    2952                 :     /*
    2953                 :      * Select a random starting point in the list.  This is to minimize
    2954                 :      * conflicts between backends that are most likely sharing the same list
    2955                 :      * of temp tablespaces.  Note that if we create multiple temp files in the
    2956                 :      * same transaction, we'll advance circularly through the list --- this
    2957                 :      * ensures that large temporary sort files are nicely spread across all
    2958                 :      * available tablespaces.
    2959                 :      */
    2960            3645 :     if (numSpaces > 1)
    2961 UIC           0 :         nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
    2962               0 :                                                   0, numSpaces - 1);
    2963                 :     else
    2964 CBC        3645 :         nextTempTableSpace = 0;
    2965 GIC        3645 : }
    2966 ECB             : 
    2967                 : /*
    2968                 :  * TempTablespacesAreSet
    2969                 :  *
    2970                 :  * Returns true if SetTempTablespaces has been called in current transaction.
    2971                 :  * (This is just so that tablespaces.c doesn't need its own per-transaction
    2972                 :  * state.)
    2973                 :  */
    2974                 : bool
    2975 GIC        4525 : TempTablespacesAreSet(void)
    2976                 : {
    2977            4525 :     return (numTempTableSpaces >= 0);
    2978 ECB             : }
    2979 EUB             : 
    2980                 : /*
    2981                 :  * GetTempTablespaces
    2982 ECB             :  *
    2983                 :  * Populate an array with the OIDs of the tablespaces that should be used for
    2984                 :  * temporary files.  (Some entries may be InvalidOid, indicating that the
    2985                 :  * current database's default tablespace should be used.)  At most numSpaces
    2986                 :  * entries will be filled.
    2987                 :  * Returns the number of OIDs that were copied into the output array.
    2988                 :  */
    2989                 : int
    2990 GIC         174 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2991                 : {
    2992                 :     int         i;
    2993 ECB             : 
    2994 GIC         174 :     Assert(TempTablespacesAreSet());
    2995 CBC         174 :     for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
    2996 UIC           0 :         tableSpaces[i] = tempTableSpaces[i];
    2997                 : 
    2998 GIC         174 :     return i;
    2999                 : }
    3000                 : 
    3001                 : /*
    3002                 :  * GetNextTempTableSpace
    3003                 :  *
    3004                 :  * Select the next temp tablespace to use.  A result of InvalidOid means
    3005                 :  * to use the current database's default tablespace.
    3006                 :  */
    3007                 : Oid
    3008 CBC        1896 : GetNextTempTableSpace(void)
    3009                 : {
    3010 GIC        1896 :     if (numTempTableSpaces > 0)
    3011                 :     {
    3012 ECB             :         /* Advance nextTempTableSpace counter with wraparound */
    3013 CBC           1 :         if (++nextTempTableSpace >= numTempTableSpaces)
    3014 GBC           1 :             nextTempTableSpace = 0;
    3015 GIC           1 :         return tempTableSpaces[nextTempTableSpace];
    3016 ECB             :     }
    3017 GIC        1895 :     return InvalidOid;
    3018                 : }
    3019                 : 
    3020                 : 
    3021                 : /*
    3022                 :  * AtEOSubXact_Files
    3023                 :  *
    3024                 :  * Take care of subtransaction commit/abort.  At abort, we close temp files
    3025                 :  * that the subtransaction may have opened.  At commit, we reassign the
    3026 ECB             :  * files that were opened to the parent subtransaction.
    3027                 :  */
    3028                 : void
    3029 GIC        8795 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
    3030                 :                   SubTransactionId parentSubid)
    3031 ECB             : {
    3032                 :     Index       i;
    3033                 : 
    3034 GIC        8795 :     for (i = 0; i < numAllocatedDescs; i++)
    3035 ECB             :     {
    3036 UIC           0 :         if (allocatedDescs[i].create_subid == mySubid)
    3037                 :         {
    3038               0 :             if (isCommit)
    3039               0 :                 allocatedDescs[i].create_subid = parentSubid;
    3040                 :             else
    3041                 :             {
    3042                 :                 /* have to recheck the item after FreeDesc (ugly) */
    3043               0 :                 FreeDesc(&allocatedDescs[i--]);
    3044                 :             }
    3045                 :         }
    3046                 :     }
    3047 CBC        8795 : }
    3048                 : 
    3049                 : /*
    3050                 :  * AtEOXact_Files
    3051                 :  *
    3052 ECB             :  * This routine is called during transaction commit or abort.  All still-open
    3053                 :  * per-transaction temporary file VFDs are closed, which also causes the
    3054 EUB             :  * underlying files to be deleted (although they should've been closed already
    3055                 :  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
    3056                 :  * closed. We also forget any transaction-local temp tablespace list.
    3057                 :  *
    3058                 :  * The isCommit flag is used only to decide whether to emit warnings about
    3059                 :  * unclosed files.
    3060                 :  */
    3061                 : void
    3062 GIC      486167 : AtEOXact_Files(bool isCommit)
    3063                 : {
    3064          486167 :     CleanupTempFiles(isCommit, false);
    3065 CBC      486167 :     tempTableSpaces = NULL;
    3066 GIC      486167 :     numTempTableSpaces = -1;
    3067          486167 : }
    3068                 : 
    3069                 : /*
    3070                 :  * BeforeShmemExit_Files
    3071                 :  *
    3072                 :  * before_shmem_exit hook to clean up temp files during backend shutdown.
    3073                 :  * Here, we want to clean up *all* temp files including interXact ones.
    3074                 :  */
    3075                 : static void
    3076           13291 : BeforeShmemExit_Files(int code, Datum arg)
    3077                 : {
    3078           13291 :     CleanupTempFiles(false, true);
    3079                 : 
    3080 ECB             :     /* prevent further temp files from being created */
    3081                 : #ifdef USE_ASSERT_CHECKING
    3082 CBC       13291 :     temporary_files_allowed = false;
    3083 ECB             : #endif
    3084 CBC       13291 : }
    3085 ECB             : 
    3086                 : /*
    3087                 :  * Close temporary files and delete their underlying files.
    3088                 :  *
    3089                 :  * isCommit: if true, this is normal transaction commit, and we don't
    3090                 :  * expect any remaining files; warn if there are some.
    3091                 :  *
    3092                 :  * isProcExit: if true, this is being called as the backend process is
    3093                 :  * exiting. If that's the case, we should remove all temporary files; if
    3094                 :  * that's not the case, we are being called for transaction commit/abort
    3095                 :  * and should only remove transaction-local temp files.  In either case,
    3096                 :  * also clean up "allocated" stdio files, dirs and fds.
    3097                 :  */
    3098                 : static void
    3099 GIC      499458 : CleanupTempFiles(bool isCommit, bool isProcExit)
    3100 ECB             : {
    3101                 :     Index       i;
    3102                 : 
    3103                 :     /*
    3104                 :      * Careful here: at proc_exit we need extra cleanup, not just
    3105                 :      * xact_temporary files.
    3106                 :      */
    3107 GIC      499458 :     if (isProcExit || have_xact_temporary_files)
    3108                 :     {
    3109           14042 :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    3110          843516 :         for (i = 1; i < SizeVfdCache; i++)
    3111                 :         {
    3112          829474 :             unsigned short fdstate = VfdCache[i].fdstate;
    3113                 : 
    3114          829474 :             if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
    3115               5 :                 VfdCache[i].fileName != NULL)
    3116                 :             {
    3117 ECB             :                 /*
    3118                 :                  * If we're in the process of exiting a backend process, close
    3119                 :                  * all temporary files. Otherwise, only close temporary files
    3120                 :                  * local to the current transaction. They should be closed by
    3121                 :                  * the ResourceOwner mechanism already, so this is just a
    3122                 :                  * debugging cross-check.
    3123                 :                  */
    3124 GIC           5 :                 if (isProcExit)
    3125 CBC           5 :                     FileClose(i);
    3126 UIC           0 :                 else if (fdstate & FD_CLOSE_AT_EOXACT)
    3127 ECB             :                 {
    3128 LBC           0 :                     elog(WARNING,
    3129                 :                          "temporary file %s not closed at end-of-transaction",
    3130 ECB             :                          VfdCache[i].fileName);
    3131 UIC           0 :                     FileClose(i);
    3132 ECB             :                 }
    3133                 :             }
    3134                 :         }
    3135                 : 
    3136 GIC       14042 :         have_xact_temporary_files = false;
    3137                 :     }
    3138                 : 
    3139                 :     /* Complain if any allocated files remain open at commit. */
    3140          499458 :     if (isCommit && numAllocatedDescs > 0)
    3141 UIC           0 :         elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
    3142 ECB             :              numAllocatedDescs);
    3143                 : 
    3144 EUB             :     /* Clean up "allocated" stdio files, dirs and fds. */
    3145 GIC      499557 :     while (numAllocatedDescs > 0)
    3146 GBC          99 :         FreeDesc(&allocatedDescs[0]);
    3147 GIC      499458 : }
    3148                 : 
    3149 EUB             : 
    3150                 : /*
    3151                 :  * Remove temporary and temporary relation files left over from a prior
    3152                 :  * postmaster session
    3153                 :  *
    3154 ECB             :  * This should be called during postmaster startup.  It will forcibly
    3155                 :  * remove any leftover files created by OpenTemporaryFile and any leftover
    3156                 :  * temporary relation files created by mdcreate.
    3157                 :  *
    3158                 :  * During post-backend-crash restart cycle, this routine is called when
    3159 EUB             :  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
    3160                 :  * queries are using temp files could result in useless storage usage that can
    3161                 :  * only be reclaimed by a service restart. The argument against enabling it is
    3162                 :  * that someone might want to examine the temporary files for debugging
    3163 ECB             :  * purposes. Leaving such files in place does however mean that
    3164                 :  * OpenTemporaryFile had better allow for collision with an existing temp file name.
    3165                 :  *
    3166                 :  * NOTE: this function and its subroutines generally report syscall failures
    3167                 :  * with ereport(LOG) and keep going.  Removing temp files is not so critical
    3168                 :  * that we should fail to start the database when we can't do it.
    3169                 :  */
    3170                 : void
    3171 GIC         596 : RemovePgTempFiles(void)
    3172                 : {
    3173                 :     char        temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    3174                 :     DIR        *spc_dir;
    3175                 :     struct dirent *spc_de;
    3176                 : 
    3177                 :     /*
    3178                 :      * First process temp files in pg_default ($PGDATA/base)
    3179                 :      */
    3180             596 :     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
    3181             596 :     RemovePgTempFilesInDir(temp_path, true, false);
    3182             596 :     RemovePgTempRelationFiles("base");
    3183                 : 
    3184                 :     /*
    3185                 :      * Cycle through temp directories for all non-default tablespaces.
    3186                 :      */
    3187             596 :     spc_dir = AllocateDir("pg_tblspc");
    3188                 : 
    3189 CBC        1839 :     while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
    3190                 :     {
    3191 GIC        1243 :         if (strcmp(spc_de->d_name, ".") == 0 ||
    3192             647 :             strcmp(spc_de->d_name, "..") == 0)
    3193            1192 :             continue;
    3194                 : 
    3195              51 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
    3196              51 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
    3197              51 :         RemovePgTempFilesInDir(temp_path, true, false);
    3198 ECB             : 
    3199 CBC          51 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
    3200              51 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
    3201 GIC          51 :         RemovePgTempRelationFiles(temp_path);
    3202                 :     }
    3203                 : 
    3204             596 :     FreeDir(spc_dir);
    3205 ECB             : 
    3206                 :     /*
    3207                 :      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
    3208                 :      * DataDir as well.  However, that is *not* cleaned here because doing so
    3209                 :      * would create a race condition.  It's done separately, earlier in
    3210                 :      * postmaster startup.
    3211                 :      */
    3212 GIC         596 : }
    3213 ECB             : 
    3214                 : /*
    3215                 :  * Process one pgsql_tmp directory for RemovePgTempFiles.
    3216                 :  *
    3217                 :  * If missing_ok is true, it's all right for the named directory to not exist.
    3218                 :  * Any other problem results in a LOG message.  (missing_ok should be true at
    3219                 :  * the top level, since pgsql_tmp directories are not created until needed.)
    3220                 :  *
    3221                 :  * At the top level, this should be called with unlink_all = false, so that
    3222                 :  * only files matching the temporary name prefix will be unlinked.  When
    3223                 :  * recursing it will be called with unlink_all = true to unlink everything
    3224                 :  * under a top-level temporary directory.
    3225                 :  *
    3226                 :  * (These two flags could be replaced by one, but it seems clearer to keep
    3227                 :  * them separate.)
    3228                 :  */
    3229                 : void
    3230 CBC         648 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
    3231                 : {
    3232                 :     DIR        *temp_dir;
    3233                 :     struct dirent *temp_de;
    3234                 :     char        rm_path[MAXPGPATH * 2];
    3235                 : 
    3236 GIC         648 :     temp_dir = AllocateDir(tmpdirname);
    3237                 : 
    3238             648 :     if (temp_dir == NULL && errno == ENOENT && missing_ok)
    3239             603 :         return;
    3240                 : 
    3241             138 :     while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
    3242                 :     {
    3243              93 :         if (strcmp(temp_de->d_name, ".") == 0 ||
    3244              48 :             strcmp(temp_de->d_name, "..") == 0)
    3245              90 :             continue;
    3246                 : 
    3247               3 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3248 CBC           3 :                  tmpdirname, temp_de->d_name);
    3249                 : 
    3250 GIC           3 :         if (unlink_all ||
    3251               3 :             strncmp(temp_de->d_name,
    3252                 :                     PG_TEMP_FILE_PREFIX,
    3253                 :                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
    3254 CBC           3 :         {
    3255 GNC           3 :             PGFileType  type = get_dirent_type(rm_path, temp_de, false, LOG);
    3256 ECB             : 
    3257 GNC           3 :             if (type == PGFILETYPE_ERROR)
    3258 LBC           0 :                 continue;
    3259 GNC           3 :             else if (type == PGFILETYPE_DIR)
    3260 ECB             :             {
    3261                 :                 /* recursively remove contents, then directory itself */
    3262 CBC           1 :                 RemovePgTempFilesInDir(rm_path, false, true);
    3263 ECB             : 
    3264 GIC           1 :                 if (rmdir(rm_path) < 0)
    3265 UIC           0 :                     ereport(LOG,
    3266 ECB             :                             (errcode_for_file_access(),
    3267                 :                              errmsg("could not remove directory \"%s\": %m",
    3268                 :                                     rm_path)));
    3269                 :             }
    3270 EUB             :             else
    3271 ECB             :             {
    3272 GIC           2 :                 if (unlink(rm_path) < 0)
    3273 UIC           0 :                     ereport(LOG,
    3274 ECB             :                             (errcode_for_file_access(),
    3275                 :                              errmsg("could not remove file \"%s\": %m",
    3276                 :                                     rm_path)));
    3277 EUB             :             }
    3278                 :         }
    3279                 :         else
    3280 UIC           0 :             ereport(LOG,
    3281                 :                     (errmsg("unexpected file found in temporary-files directory: \"%s\"",
    3282                 :                             rm_path)));
    3283                 :     }
    3284 ECB             : 
    3285 GBC          45 :     FreeDir(temp_dir);
    3286                 : }
    3287                 : 
    3288                 : /* Process one tablespace directory, look for per-DB subdirectories */
    3289                 : static void
    3290 GIC         647 : RemovePgTempRelationFiles(const char *tsdirname)
    3291                 : {
    3292 EUB             :     DIR        *ts_dir;
    3293                 :     struct dirent *de;
    3294                 :     char        dbspace_path[MAXPGPATH * 2];
    3295                 : 
    3296 GIC         647 :     ts_dir = AllocateDir(tsdirname);
    3297 ECB             : 
    3298 GIC        4011 :     while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
    3299                 :     {
    3300                 :         /*
    3301                 :          * We're only interested in the per-database directories, which have
    3302 ECB             :          * numeric names.  Note that this code will also (properly) ignore "."
    3303                 :          * and "..".
    3304                 :          */
    3305 GIC        3364 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    3306            1338 :             continue;
    3307                 : 
    3308 CBC        2026 :         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
    3309 GIC        2026 :                  tsdirname, de->d_name);
    3310 CBC        2026 :         RemovePgTempRelationFilesInDbspace(dbspace_path);
    3311                 :     }
    3312                 : 
    3313 GIC         647 :     FreeDir(ts_dir);
    3314             647 : }
    3315                 : 
    3316                 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
    3317 ECB             : static void
    3318 CBC        2026 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
    3319                 : {
    3320 ECB             :     DIR        *dbspace_dir;
    3321                 :     struct dirent *de;
    3322                 :     char        rm_path[MAXPGPATH * 2];
    3323                 : 
    3324 GIC        2026 :     dbspace_dir = AllocateDir(dbspacedirname);
    3325 ECB             : 
    3326 CBC      607643 :     while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
    3327                 :     {
    3328 GIC      605617 :         if (!looks_like_temp_rel_name(de->d_name))
    3329          605608 :             continue;
    3330 ECB             : 
    3331 GIC           9 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3332               9 :                  dbspacedirname, de->d_name);
    3333                 : 
    3334               9 :         if (unlink(rm_path) < 0)
    3335 UIC           0 :             ereport(LOG,
    3336 ECB             :                     (errcode_for_file_access(),
    3337                 :                      errmsg("could not remove file \"%s\": %m",
    3338                 :                             rm_path)));
    3339                 :     }
    3340                 : 
    3341 CBC        2026 :     FreeDir(dbspace_dir);
    3342 GIC        2026 : }
    3343 ECB             : 
    3344                 : /* t<digits>_<digits> or t<digits>_<digits>_<forkname>, optionally followed by .<segment digits> */
    3345                 : bool
    3346 CBC      831248 : looks_like_temp_rel_name(const char *name)
    3347 EUB             : {
    3348                 :     int         pos;
    3349                 :     int         savepos;
    3350                 : 
    3351                 :     /* Must start with "t". */
    3352 GIC      831248 :     if (name[0] != 't')
    3353 CBC      831199 :         return false;
    3354 ECB             : 
    3355                 :     /* Followed by a non-empty string of digits and then an underscore. */
    3356 GIC         225 :     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
    3357                 :         ;
    3358 CBC          49 :     if (pos == 1 || name[pos] != '_')
    3359 UIC           0 :         return false;
    3360                 : 
    3361                 :     /* Followed by another nonempty string of digits. */
    3362 GIC         250 :     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
    3363                 :         ;
    3364 CBC          49 :     if (savepos == pos)
    3365 LBC           0 :         return false;
    3366                 : 
    3367                 :     /* We might have _forkname or .segment or both. */
    3368 CBC          49 :     if (name[pos] == '_')
    3369                 :     {
    3370              22 :         int         forkchar = forkname_chars(&name[pos + 1], NULL);
    3371 EUB             : 
    3372 GIC          22 :         if (forkchar <= 0)
    3373 UIC           0 :             return false;
    3374 CBC          22 :         pos += forkchar + 1;
    3375                 :     }
    3376              49 :     if (name[pos] == '.')
    3377 EUB             :     {
    3378                 :         int         segchar;
    3379                 : 
    3380 CBC          44 :         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
    3381                 :             ;
    3382              22 :         if (segchar <= 1)
    3383 UIC           0 :             return false;
    3384 CBC          22 :         pos += segchar;
    3385 EUB             :     }
    3386 ECB             : 
    3387                 :     /* Now we should be at the end. */
    3388 CBC          49 :     if (name[pos] != '\0')
    3389 UIC           0 :         return false;
    3390 GIC          49 :     return true;
    3391                 : }
    3392 ECB             : 
    3393                 : #ifdef HAVE_SYNCFS
    3394                 : static void
    3395 UBC           0 : do_syncfs(const char *path)
    3396 ECB             : {
    3397                 :     int         fd;
    3398                 : 
    3399 UIC           0 :     ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
    3400 ECB             :                              path);
    3401 EUB             : 
    3402 LBC           0 :     fd = OpenTransientFile(path, O_RDONLY);
    3403 UIC           0 :     if (fd < 0)
    3404                 :     {
    3405               0 :         ereport(LOG,
    3406                 :                 (errcode_for_file_access(),
    3407 EUB             :                  errmsg("could not open file \"%s\": %m", path)));
    3408 UIC           0 :         return;
    3409                 :     }
    3410               0 :     if (syncfs(fd) < 0)
    3411 UBC           0 :         ereport(LOG,
    3412                 :                 (errcode_for_file_access(),
    3413                 :                  errmsg("could not synchronize file system for file \"%s\": %m", path)));
    3414               0 :     CloseTransientFile(fd);
    3415 EUB             : }
    3416                 : #endif
    3417                 : 
    3418                 : /*
    3419                 :  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
    3420                 :  * all potential filesystems, depending on the recovery_init_sync_method setting.
    3421                 :  *
    3422                 :  * We fsync regular files and directories wherever they are, but we
    3423                 :  * follow symlinks only for pg_wal and immediately under pg_tblspc.
    3424                 :  * Other symlinks are presumed to point at files we're not responsible
    3425                 :  * for fsyncing, and might not have privileges to write at all.
    3426                 :  *
    3427                 :  * Errors are logged but not considered fatal; that's because this is used
    3428                 :  * only during database startup, to deal with the possibility that there are
    3429                 :  * issued-but-unsynced writes pending against the data directory.  We want to
    3430                 :  * ensure that such writes reach disk before anything that's done in the new
    3431                 :  * run.  However, aborting on error would result in failure to start for
    3432                 :  * harmless cases such as read-only files in the data directory, and that's
    3433                 :  * not good either.
    3434                 :  *
    3435                 :  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
    3436                 :  * rewriting all changes again during recovery.
    3437                 :  *
    3438                 :  * Note we assume we're chdir'd into PGDATA to begin with.
    3439                 :  */
    3440                 : void
    3441 GIC         131 : SyncDataDirectory(void)
    3442                 : {
    3443                 :     bool        xlog_is_symlink;
    3444                 : 
    3445                 :     /* We can skip this whole thing if fsync is disabled. */
    3446             131 :     if (!enableFsync)
    3447             131 :         return;
    3448                 : 
    3449                 :     /*
    3450                 :      * If pg_wal is a symlink, we'll need to recurse into it separately,
    3451                 :      * because the first walkdir below will ignore it.
    3452                 :      */
    3453 LBC           0 :     xlog_is_symlink = false;
    3454                 : 
    3455                 :     {
    3456                 :         struct stat st;
    3457 ECB             : 
    3458 LBC           0 :         if (lstat("pg_wal", &st) < 0)
    3459 UIC           0 :             ereport(LOG,
    3460                 :                     (errcode_for_file_access(),
    3461                 :                      errmsg("could not stat file \"%s\": %m",
    3462                 :                             "pg_wal")));
    3463               0 :         else if (S_ISLNK(st.st_mode))
    3464 UBC           0 :             xlog_is_symlink = true;
    3465                 :     }
    3466 EUB             : 
    3467                 : #ifdef HAVE_SYNCFS
    3468 UIC           0 :     if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
    3469                 :     {
    3470 EUB             :         DIR        *dir;
    3471                 :         struct dirent *de;
    3472                 : 
    3473                 :         /*
    3474                 :          * On Linux, we don't have to open every single file one by one.  We
    3475                 :          * can use syncfs() to sync whole filesystems.  We only expect
    3476                 :          * filesystem boundaries to exist where we tolerate symlinks, namely
    3477                 :          * pg_wal and the tablespaces, so we call syncfs() for each of those
    3478                 :          * directories.
    3479                 :          */
    3480                 : 
    3481                 :         /* Prepare to report progress syncing the data directory via syncfs. */
    3482 UIC           0 :         begin_startup_progress_phase();
    3483                 : 
    3484                 :         /* Sync the top level pgdata directory. */
    3485               0 :         do_syncfs(".");
    3486                 :         /* If any tablespaces are configured, sync each of those. */
    3487               0 :         dir = AllocateDir("pg_tblspc");
    3488               0 :         while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
    3489 EUB             :         {
    3490                 :             char        path[MAXPGPATH];
    3491                 : 
    3492 UBC           0 :             if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
    3493 UIC           0 :                 continue;
    3494 EUB             : 
    3495 UBC           0 :             snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
    3496 UIC           0 :             do_syncfs(path);
    3497                 :         }
    3498               0 :         FreeDir(dir);
    3499 EUB             :         /* If pg_wal is a symlink, process that too. */
    3500 UBC           0 :         if (xlog_is_symlink)
    3501 UIC           0 :             do_syncfs("pg_wal");
    3502 UBC           0 :         return;
    3503 EUB             :     }
    3504                 : #endif                          /* HAVE_SYNCFS */
    3505                 : 
    3506                 : #ifdef PG_FLUSH_DATA_WORKS
    3507                 :     /* Prepare to report progress of the pre-fsync phase. */
    3508 UBC           0 :     begin_startup_progress_phase();
    3509 EUB             : 
    3510                 :     /*
    3511                 :      * If possible, hint to the kernel that we're soon going to fsync the data
    3512                 :      * directory and its contents.  Errors in this step are even less
    3513                 :      * interesting than normal, so log them only at DEBUG1.
    3514                 :      */
    3515 UBC           0 :     walkdir(".", pre_sync_fname, false, DEBUG1);
    3516 UIC           0 :     if (xlog_is_symlink)
    3517               0 :         walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    3518               0 :     walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
    3519                 : #endif
    3520                 : 
    3521                 :     /* Prepare to report progress syncing the data directory via fsync. */
    3522 UBC           0 :     begin_startup_progress_phase();
    3523 EUB             : 
    3524                 :     /*
    3525                 :      * Now we do the fsync()s in the same order.
    3526                 :      *
    3527                 :      * The main call ignores symlinks, so in addition to specially processing
    3528                 :      * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
    3529                 :      * process_symlinks = true.  Note that if there are any plain directories
    3530                 :      * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
    3531                 :      * so we don't worry about optimizing it.
    3532                 :      */
    3533 UIC           0 :     walkdir(".", datadir_fsync_fname, false, LOG);
    3534               0 :     if (xlog_is_symlink)
    3535               0 :         walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    3536               0 :     walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
    3537                 : }
    3538                 : 
    3539                 : /*
    3540 EUB             :  * walkdir: recursively walk a directory, applying the action to each
    3541                 :  * regular file and directory (including the named directory itself).
    3542                 :  *
    3543                 :  * If process_symlinks is true, the action and recursion are also applied
    3544                 :  * to regular files and directories that are pointed to by symlinks in the
    3545                 :  * given directory; otherwise symlinks are ignored.  Symlinks are always
    3546                 :  * ignored in subdirectories, ie we intentionally don't pass down the
    3547                 :  * process_symlinks flag to recursive calls.
    3548                 :  *
    3549                 :  * Errors are reported at level elevel, which might be ERROR or less.
    3550                 :  *
    3551                 :  * See also walkdir in file_utils.c, which is a frontend version of this
    3552                 :  * logic.
    3553                 :  */
    3554                 : static void
    3555 GIC         162 : walkdir(const char *path,
    3556                 :         void (*action) (const char *fname, bool isdir, int elevel),
    3557                 :         bool process_symlinks,
    3558                 :         int elevel)
    3559                 : {
    3560                 :     DIR        *dir;
    3561                 :     struct dirent *de;
    3562 ECB             : 
    3563 GIC         162 :     dir = AllocateDir(path);
    3564                 : 
    3565            1886 :     while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
    3566                 :     {
    3567                 :         char        subpath[MAXPGPATH * 2];
    3568                 : 
    3569            1724 :         CHECK_FOR_INTERRUPTS();
    3570 ECB             : 
    3571 GIC        1724 :         if (strcmp(de->d_name, ".") == 0 ||
    3572 CBC        1562 :             strcmp(de->d_name, "..") == 0)
    3573 GIC         324 :             continue;
    3574                 : 
    3575            1400 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
    3576 ECB             : 
    3577 GIC        1400 :         switch (get_dirent_type(subpath, de, process_symlinks, elevel))
    3578 ECB             :         {
    3579 CBC        1400 :             case PGFILETYPE_REG:
    3580            1400 :                 (*action) (subpath, false, elevel);
    3581 GIC        1400 :                 break;
    3582 LBC           0 :             case PGFILETYPE_DIR:
    3583 UIC           0 :                 walkdir(subpath, action, false, elevel);
    3584 LBC           0 :                 break;
    3585 UIC           0 :             default:
    3586 ECB             : 
    3587                 :                 /*
    3588                 :                  * Errors are already reported directly by get_dirent_type(),
    3589 EUB             :                  * and any remaining symlinks and unknown file types are
    3590                 :                  * ignored.
    3591                 :                  */
    3592 UBC           0 :                 break;
    3593                 :         }
    3594                 :     }
    3595                 : 
    3596 GIC         162 :     FreeDir(dir);               /* we ignore any error here */
    3597                 : 
    3598                 :     /*
    3599 EUB             :      * It's important to fsync the destination directory itself as individual
    3600                 :      * file fsyncs don't guarantee that the directory entry for the file is
    3601                 :      * synced.  However, skip this if AllocateDir failed; the action function
    3602                 :      * might not be robust against that.
    3603 ECB             :      */
    3604 GIC         162 :     if (dir)
    3605             162 :         (*action) (path, true, elevel);
    3606             162 : }
    3607                 : 
    3608                 : 
    3609                 : /*
    3610                 :  * Hint to the OS that it should get ready to fsync() this file.
    3611 ECB             :  *
    3612                 :  * Ignores errors trying to open unreadable files, and logs other errors at a
    3613                 :  * caller-specified level.
    3614                 :  */
    3615                 : #ifdef PG_FLUSH_DATA_WORKS
    3616                 : 
    3617                 : static void
    3618 UIC           0 : pre_sync_fname(const char *fname, bool isdir, int elevel)
    3619                 : {
    3620                 :     int         fd;
    3621                 : 
    3622                 :     /* Don't try to flush directories, it'll likely just fail */
    3623               0 :     if (isdir)
    3624               0 :         return;
    3625 EUB             : 
    3626 UIC           0 :     ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
    3627                 :                              fname);
    3628                 : 
    3629               0 :     fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
    3630 EUB             : 
    3631 UBC           0 :     if (fd < 0)
    3632                 :     {
    3633               0 :         if (errno == EACCES)
    3634 UIC           0 :             return;
    3635               0 :         ereport(elevel,
    3636 EUB             :                 (errcode_for_file_access(),
    3637                 :                  errmsg("could not open file \"%s\": %m", fname)));
    3638 UBC           0 :         return;
    3639                 :     }
    3640 EUB             : 
    3641                 :     /*
    3642                 :      * pg_flush_data() ignores errors, which is ok because this is only a
    3643                 :      * hint.
    3644                 :      */
    3645 UBC           0 :     pg_flush_data(fd, 0, 0);
    3646                 : 
    3647 UIC           0 :     if (CloseTransientFile(fd) != 0)
    3648               0 :         ereport(elevel,
    3649                 :                 (errcode_for_file_access(),
    3650                 :                  errmsg("could not close file \"%s\": %m", fname)));
    3651                 : }
    3652 EUB             : 
    3653                 : #endif                          /* PG_FLUSH_DATA_WORKS */
    3654                 : 
    3655                 : static void
    3656 UIC           0 : datadir_fsync_fname(const char *fname, bool isdir, int elevel)
    3657                 : {
    3658               0 :     ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
    3659                 :                              fname);
    3660                 : 
    3661                 :     /*
    3662                 :      * We want to silently ignore errors about unreadable files.  Pass that
    3663 EUB             :      * desire on to fsync_fname_ext().
    3664                 :      */
    3665 UBC           0 :     fsync_fname_ext(fname, isdir, true, elevel);
    3666 UIC           0 : }
    3667                 : 
    3668                 : static void
    3669 GIC        1562 : unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
    3670                 : {
    3671            1562 :     if (isdir)
    3672 EUB             :     {
    3673 GBC         162 :         if (rmdir(fname) != 0 && errno != ENOENT)
    3674 UIC           0 :             ereport(elevel,
    3675                 :                     (errcode_for_file_access(),
    3676 ECB             :                      errmsg("could not remove directory \"%s\": %m", fname)));
    3677                 :     }
    3678                 :     else
    3679                 :     {
    3680                 :         /* Use PathNameDeleteTemporaryFile to report filesize */
    3681 GBC        1400 :         PathNameDeleteTemporaryFile(fname, false);
    3682                 :     }
    3683 GIC        1562 : }
    3684                 : 
    3685                 : /*
    3686                 :  * fsync_fname_ext -- Try to fsync a file or directory
    3687                 :  *
    3688 ECB             :  * If ignore_perm is true, ignore errors upon trying to open unreadable
    3689                 :  * files. Logs other errors at a caller-specified level.
    3690                 :  *
    3691                 :  * Returns 0 if the operation succeeded, -1 otherwise.
    3692                 :  */
    3693                 : int
    3694 GIC       34108 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
    3695                 : {
    3696                 :     int         fd;
    3697                 :     int         flags;
    3698                 :     int         returncode;
    3699                 : 
    3700                 :     /*
    3701 ECB             :      * Some OSs require directories to be opened read-only whereas other
    3702                 :      * systems don't allow us to fsync files opened read-only; so we need both
    3703                 :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
    3704                 :      * not writable by our userid, but we assume that's OK.
    3705                 :      */
    3706 GIC       34108 :     flags = PG_BINARY;
    3707           34108 :     if (!isdir)
    3708           11203 :         flags |= O_RDWR;
    3709                 :     else
    3710           22905 :         flags |= O_RDONLY;
    3711                 : 
    3712           34108 :     fd = OpenTransientFile(fname, flags);
    3713 ECB             : 
    3714                 :     /*
    3715                 :      * Some OSs don't allow us to open directories at all (Windows returns
    3716                 :      * EACCES), just ignore the error in that case.  If desired, also silently
    3717                 :      * ignore errors about unreadable files.  Log others.
    3718                 :      */
    3719 CBC       34108 :     if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
    3720 UIC           0 :         return 0;
    3721 GIC       34108 :     else if (fd < 0 && ignore_perm && errno == EACCES)
    3722 UIC           0 :         return 0;
    3723 GIC       34108 :     else if (fd < 0)
    3724                 :     {
    3725 UIC           0 :         ereport(elevel,
    3726 ECB             :                 (errcode_for_file_access(),
    3727 EUB             :                  errmsg("could not open file \"%s\": %m", fname)));
    3728 LBC           0 :         return -1;
    3729 EUB             :     }
    3730 ECB             : 
    3731 GIC       34108 :     returncode = pg_fsync(fd);
    3732 EUB             : 
    3733                 :     /*
    3734                 :      * Some OSes don't allow us to fsync directories at all, so we can ignore
    3735                 :      * those errors. Anything else needs to be logged.
    3736                 :      */
    3737 GIC       34108 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
    3738 ECB             :     {
    3739                 :         int         save_errno;
    3740                 : 
    3741                 :         /* close file upon error, might not be in transaction context */
    3742 UIC           0 :         save_errno = errno;
    3743               0 :         (void) CloseTransientFile(fd);
    3744 LBC           0 :         errno = save_errno;
    3745                 : 
    3746 UIC           0 :         ereport(elevel,
    3747                 :                 (errcode_for_file_access(),
    3748                 :                  errmsg("could not fsync file \"%s\": %m", fname)));
    3749 UBC           0 :         return -1;
    3750 EUB             :     }
    3751                 : 
    3752 GIC       34108 :     if (CloseTransientFile(fd) != 0)
    3753 EUB             :     {
    3754 UIC           0 :         ereport(elevel,
    3755                 :                 (errcode_for_file_access(),
    3756 EUB             :                  errmsg("could not close file \"%s\": %m", fname)));
    3757 UIC           0 :         return -1;
    3758                 :     }
    3759 ECB             : 
    3760 GIC       34108 :     return 0;
    3761 EUB             : }
    3762                 : 
    3763                 : /*
    3764                 :  * fsync_parent_path -- fsync the parent path of a file or directory
    3765                 :  *
    3766                 :  * This is aimed at making file operations persistent on disk in case of
    3767 ECB             :  * an OS crash or power failure.
    3768                 :  */
    3769                 : static int
    3770 GIC        4809 : fsync_parent_path(const char *fname, int elevel)
    3771                 : {
    3772                 :     char        parentpath[MAXPGPATH];
    3773                 : 
    3774            4809 :     strlcpy(parentpath, fname, MAXPGPATH);
    3775            4809 :     get_parent_directory(parentpath);
    3776                 : 
    3777 ECB             :     /*
    3778                 :      * get_parent_directory() returns an empty string if the input argument is
    3779                 :      * just a file name (see comments in path.c), so handle that as being the
    3780                 :      * current directory.
    3781                 :      */
    3782 CBC        4809 :     if (strlen(parentpath) == 0)
    3783 GIC         151 :         strlcpy(parentpath, ".", MAXPGPATH);
    3784                 : 
    3785            4809 :     if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
    3786 UIC           0 :         return -1;
    3787                 : 
    3788 GIC        4809 :     return 0;
    3789 ECB             : }
    3790                 : 
    3791                 : /*
    3792                 :  * Create a PostgreSQL data sub-directory
    3793 EUB             :  *
    3794                 :  * The data directory itself, and most of its sub-directories, are created at
    3795 ECB             :  * initdb time, but we do have some occasions when we create directories in
    3796                 :  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
    3797                 :  * make sure that those directories are created consistently.  Today, that means
    3798                 :  * making sure that the created directory has the correct permissions, which is
    3799                 :  * what pg_dir_create_mode tracks for us.
    3800                 :  *
    3801                 :  * Note that we also set the umask() based on what we understand the correct
    3802                 :  * permissions to be (see file_perm.c).
    3803                 :  *
    3804                 :  * For permissions other than the default, mkdir() can be used directly, but
    3805                 :  * be sure to consider carefully such cases -- a sub-directory with incorrect
    3806                 :  * permissions in a PostgreSQL data directory could cause backups and other
    3807                 :  * processes to fail.
    3808                 :  */
    3809                 : int
    3810 GIC        1638 : MakePGDirectory(const char *directoryName)
    3811                 : {
    3812            1638 :     return mkdir(directoryName, pg_dir_create_mode);
    3813                 : }
    3814                 : 
    3815                 : /*
    3816                 :  * Return the passed-in error level, or PANIC if data_sync_retry is off.
    3817 ECB             :  *
    3818                 :  * Failure to fsync any data file is cause for immediate panic, unless
    3819                 :  * data_sync_retry is enabled.  Data may have been written to the operating
    3820                 :  * system and removed from our buffer pool already, and if we are running on
    3821                 :  * an operating system that forgets dirty data on write-back failure, there
    3822                 :  * may be only one copy of the data remaining: in the WAL.  A later attempt to
    3823                 :  * fsync again might falsely report success.  Therefore we must not allow any
    3824                 :  * further checkpoints to be attempted.  data_sync_retry can in theory be
    3825                 :  * enabled on systems known not to drop dirty buffered data on write-back
    3826                 :  * failure (with the likely outcome that checkpoints will continue to fail
    3827                 :  * until the underlying problem is fixed).
    3828                 :  *
    3829                 :  * Any code that reports a failure from fsync() or related functions should
    3830                 :  * filter the error level with this function.
    3831                 :  */
    3832                 : int
    3833 GIC       19861 : data_sync_elevel(int elevel)
    3834                 : {
    3835           19861 :     return data_sync_retry ? elevel : PANIC;
    3836                 : }
    3837                 : 
    3838                 : bool
    3839 GNC        1859 : check_io_direct(char **newval, void **extra, GucSource source)
    3840                 : {
    3841            1859 :     bool        result = true;
    3842                 :     int         flags;
    3843 ECB             : 
    3844                 : #if PG_O_DIRECT == 0
    3845                 :     if (strcmp(*newval, "") != 0)
    3846                 :     {
    3847                 :         GUC_check_errdetail("io_direct is not supported on this platform.");
    3848                 :         result = false;
    3849                 :     }
    3850                 :     flags = 0;
    3851                 : #else
    3852                 :     List       *elemlist;
    3853                 :     ListCell   *l;
    3854                 :     char       *rawstring;
    3855                 : 
    3856                 :     /* Need a modifiable copy of string */
    3857 GNC        1859 :     rawstring = pstrdup(*newval);
    3858                 : 
    3859            1859 :     if (!SplitGUCList(rawstring, ',', &elemlist))
    3860                 :     {
    3861 UNC           0 :         GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
    3862                 :                             "io_direct");
    3863               0 :         pfree(rawstring);
    3864               0 :         list_free(elemlist);
    3865               0 :         return false;
    3866                 :     }
    3867                 : 
    3868 GNC        1859 :     flags = 0;
    3869            1865 :     foreach(l, elemlist)
    3870                 :     {
    3871               6 :         char       *item = (char *) lfirst(l);
    3872                 : 
    3873               6 :         if (pg_strcasecmp(item, "data") == 0)
    3874               2 :             flags |= IO_DIRECT_DATA;
    3875               4 :         else if (pg_strcasecmp(item, "wal") == 0)
    3876               2 :             flags |= IO_DIRECT_WAL;
    3877               2 :         else if (pg_strcasecmp(item, "wal_init") == 0)
    3878               2 :             flags |= IO_DIRECT_WAL_INIT;
    3879                 :         else
    3880                 :         {
    3881 UNC           0 :             GUC_check_errdetail("invalid option \"%s\"", item);
    3882               0 :             result = false;
    3883 LBC           0 :             break;
    3884 ECB             :         }
    3885                 :     }
    3886                 : 
    3887                 :     /*
    3888                 :      * It's possible to configure block sizes smaller than our assumed I/O
    3889                 :      * alignment size, which could result in invalid I/O requests.
    3890                 :      */
    3891                 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
    3892                 :     if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
    3893                 :     {
    3894                 :         GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
    3895                 :         result = false;
    3896                 :     }
    3897                 : #endif
    3898                 : #if BLCKSZ < PG_IO_ALIGN_SIZE
    3899                 :     if (result && (flags & IO_DIRECT_DATA))
    3900                 :     {
    3901                 :         GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small");
    3902                 :         result = false;
    3903                 :     }
    3904                 : #endif
    3905                 : 
    3906 GNC        1859 :     pfree(rawstring);
    3907            1859 :     list_free(elemlist);
    3908                 : #endif
    3909                 : 
    3910            1859 :     if (!result)
    3911 UNC           0 :         return result;
    3912                 : 
    3913                 :     /* Save the flags in *extra, for use by assign_io_direct */
    3914 GNC        1859 :     *extra = guc_malloc(ERROR, sizeof(int));
    3915            1859 :     *((int *) *extra) = flags;
    3916                 : 
    3917            1859 :     return result;
    3918                 : }
    3919                 : 
    3920                 : extern void
    3921            1859 : assign_io_direct(const char *newval, void *extra)
    3922                 : {
    3923            1859 :     int        *flags = (int *) extra;
    3924                 : 
    3925            1859 :     io_direct_flags = *flags;
    3926 GIC        1859 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a