LCOV - Differential Code Coverage HEAD vs 15 - src/backend/storage/smgr/md.c

LCOV - differential code coverage report

Current view:	top level - src/backend/storage/smgr - md.c (source / functions)		Coverage	Total	Hit	UNC	LBC	UIC	UBC	GBC	GIC	GNC	CBC	EUB	ECB	DUB	DCB
Current:	Differential Code Coverage HEAD vs 15	Lines:	75.4 %	406	306	12	15	66	7	14	153	76	63	75	213	4	15
Current Date:	2023-04-08 17:13:01	Functions:	96.9 %	32	31			1			22	9		1	28		3
Baseline:	15	Line coverage date bins:
Baseline Date:	2023-04-08 15:09:40	[..60] days:	82.4 %	51	42	8		1			1	40	1		2
Legend:	Lines: hit not hit	(120,180] days:	84.6 %	13	11			2		1	6	3	1		7
		(180,240] days:	83.3 %	18	15	3						15
		(240..) days:	73.5 %	324	238	1	15	63	7	13	146	18	61	39	189
		Function coverage date bins:
		[..60] days:	100.0 %	2	2							2
		(120,180] days:	0.0 %	1	0										1
		(180,240] days:	100.0 %	4	4							4
		(240..) days:	50.0 %	50	25			1			22	3			24

 Age         Owner                  TLA  Line data    Source code

                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * md.c
                                  4                 :  *    This code manages relations that reside on magnetic disk.
                                  5                 :  *
                                  6                 :  * Or at least, that was what the Berkeley folk had in mind when they named
                                  7                 :  * this file.  In reality, what this code provides is an interface from
                                  8                 :  * the smgr API to Unix-like filesystem APIs, so it will work with any type
                                  9                 :  * of device for which the operating system provides filesystem support.
                                 10                 :  * It doesn't matter whether the bits are on spinning rust or some other
                                 11                 :  * storage technology.
                                 12                 :  *
                                 13                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                 14                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                 15                 :  *
                                 16                 :  *
                                 17                 :  * IDENTIFICATION
                                 18                 :  *    src/backend/storage/smgr/md.c
                                 19                 :  *
                                 20                 :  *-------------------------------------------------------------------------
                                 21                 :  */
                                 22                 : #include "postgres.h"
                                 23                 : 
                                 24                 : #include <unistd.h>
                                 25                 : #include <fcntl.h>
                                 26                 : #include <sys/file.h>
                                 27                 : 
                                 28                 : #include "access/xlog.h"
                                 29                 : #include "access/xlogutils.h"
                                 30                 : #include "commands/tablespace.h"
                                 31                 : #include "miscadmin.h"
                                 32                 : #include "pg_trace.h"
                                 33                 : #include "pgstat.h"
                                 34                 : #include "postmaster/bgwriter.h"
                                 35                 : #include "storage/bufmgr.h"
                                 36                 : #include "storage/fd.h"
                                 37                 : #include "storage/md.h"
                                 38                 : #include "storage/relfilelocator.h"
                                 39                 : #include "storage/smgr.h"
                                 40                 : #include "storage/sync.h"
                                 41                 : #include "utils/hsearch.h"
                                 42                 : #include "utils/memutils.h"
                                 43                 : 
                                 44                 : /*
                                 45                 :  *  The magnetic disk storage manager keeps track of open file
                                 46                 :  *  descriptors in its own descriptor pool.  This is done to make it
                                 47                 :  *  easier to support relations that are larger than the operating
                                 48                 :  *  system's file size limit (often 2GBytes).  In order to do that,
                                 49                 :  *  we break relations up into "segment" files that are each shorter than
                                 50                 :  *  the OS file size limit.  The segment size is set by the RELSEG_SIZE
                                 51                 :  *  configuration constant in pg_config.h.
                                 52                 :  *
                                 53                 :  *  On disk, a relation must consist of consecutively numbered segment
                                 54                 :  *  files in the pattern
                                 55                 :  *      -- Zero or more full segments of exactly RELSEG_SIZE blocks each
                                 56                 :  *      -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
                                 57                 :  *      -- Optionally, any number of inactive segments of size 0 blocks.
                                 58                 :  *  The full and partial segments are collectively the "active" segments.
                                 59                 :  *  Inactive segments are those that once contained data but are currently
                                 60                 :  *  not needed because of an mdtruncate() operation.  The reason for leaving
                                 61                 :  *  them present at size zero, rather than unlinking them, is that other
                                 62                 :  *  backends and/or the checkpointer might be holding open file references to
                                 63                 :  *  such segments.  If the relation expands again after mdtruncate(), such
                                 64                 :  *  that a deactivated segment becomes active again, it is important that
                                 65                 :  *  such file references still be valid --- else data might get written
                                 66                 :  *  out to an unlinked old copy of a segment file that will eventually
                                 67                 :  *  disappear.
                                 68                 :  *
                                 69                 :  *  File descriptors are stored in the per-fork md_seg_fds arrays inside
                                 70                 :  *  SMgrRelation. The length of these arrays is stored in md_num_open_segs.
                                 71                 :  *  Note that a fork's md_num_open_segs having a specific value does not
                                 72                 :  *  necessarily mean the relation doesn't have additional segments; we may
                                 73                 :  *  just not have opened the next segment yet.  (We could not have "all
                                 74                 :  *  segments are in the array" as an invariant anyway, since another backend
                                 75                 :  *  could extend the relation while we aren't looking.)  We do not have
                                 76                 :  *  entries for inactive segments, however; as soon as we find a partial
                                 77                 :  *  segment, we assume that any subsequent segments are inactive.
                                 78                 :  *
                                 79                 :  *  The entire MdfdVec array is palloc'd in the MdCxt memory context.
                                 80                 :  */
                                 81                 : 
                                 82                 : typedef struct _MdfdVec
                                 83                 : {
                                 84                 :     File        mdfd_vfd;       /* fd number in fd.c's pool */
                                 85                 :     BlockNumber mdfd_segno;     /* segment number, from 0 */
                                 86                 : } MdfdVec;
                                 87                 : 
                                 88                 : static MemoryContext MdCxt;     /* context for all MdfdVec objects */
                                 89                 : 
                                 90                 : 
                                 91                 : /* Populate a file tag describing an md.c segment file. */
                                 92                 : #define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
                                 93                 : ( \
                                 94                 :     memset(&(a), 0, sizeof(FileTag)), \
                                 95                 :     (a).handler = SYNC_HANDLER_MD, \
                                 96                 :     (a).rlocator = (xx_rlocator), \
                                 97                 :     (a).forknum = (xx_forknum), \
                                 98                 :     (a).segno = (xx_segno) \
                                 99                 : )
                                100                 : 
                                101                 : 
                                102                 : /*** behavior for mdopen & _mdfd_getseg ***/
                                103                 : /* ereport if segment not present */
                                104                 : #define EXTENSION_FAIL              (1 << 0)
                                105                 : /* return NULL if segment not present */
                                106                 : #define EXTENSION_RETURN_NULL       (1 << 1)
                                107                 : /* create new segments as needed */
                                108                 : #define EXTENSION_CREATE            (1 << 2)
                                109                 : /* create new segments if needed during recovery */
                                110                 : #define EXTENSION_CREATE_RECOVERY   (1 << 3)
                                111                 : /*
                                112                 :  * Allow opening segments which are preceded by segments smaller than
                                113                 :  * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks
                                114                 :  * mdnblocks() and related functionality henceforth - which currently is ok,
                                115                 :  * because this is only required in the checkpointer which never uses
                                116                 :  * mdnblocks().
                                117                 :  */
                                118                 : #define EXTENSION_DONT_CHECK_SIZE   (1 << 4)
                                119                 : /* don't try to open a segment, if not already open */
                                120                 : #define EXTENSION_DONT_OPEN         (1 << 5)
                                121                 : 
                                122                 : 
                                123                 : /* local routines */
                                124                 : static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
                                125                 :                          bool isRedo);
                                126                 : static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
                                127                 : static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
                                128                 :                                    MdfdVec *seg);
                                129                 : static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
                                130                 :                                     BlockNumber segno);
                                131                 : static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
                                132                 :                                     BlockNumber segno);
                                133                 : static void _fdvec_resize(SMgrRelation reln,
                                134                 :                           ForkNumber forknum,
                                135                 :                           int nseg);
                                136                 : static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
                                137                 :                            BlockNumber segno);
                                138                 : static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
                                139                 :                               BlockNumber segno, int oflags);
                                140                 : static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
                                141                 :                              BlockNumber blkno, bool skipFsync, int behavior);
                                142                 : static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
                                143                 :                               MdfdVec *seg);
                                144                 : 
                                145                 : static inline int
    1 tmunro                    146 GNC     1739246 : _mdfd_open_flags(void)
                                147                 : {
                                148         1739246 :     int         flags = O_RDWR | PG_BINARY;
                                149                 : 
                                150         1739246 :     if (io_direct_flags & IO_DIRECT_DATA)
                                151             304 :         flags |= PG_O_DIRECT;
                                152                 : 
                                153         1739246 :     return flags;
                                154                 : }
                                155                 : 
 9770 scrappy                   156 ECB             : /*
                                157                 :  *  mdinit() -- Initialize private state for magnetic disk storage manager.
                                158                 :  */
                                159                 : void
 7956 tgl                       160 CBC       13296 : mdinit(void)
 9770 scrappy                   161 ECB             : {
 8320 tgl                       162 GIC       13296 :     MdCxt = AllocSetContextCreate(TopMemoryContext,
 8320 tgl                       163 ECB             :                                   "MdSmgr",
                                164                 :                                   ALLOCSET_DEFAULT_SIZES);
 5036 heikki.linnakangas        165 GIC       13296 : }
                                166                 : 
                                167                 : /*
                                168                 :  *  mdexists() -- Does the physical file exist?
                                169                 :  *
 5354 heikki.linnakangas        170 ECB             :  * Note: this will return true for lingering files, with pending deletions
                                171                 :  */
                                172                 : bool
  202 pg                        173 GNC     1061928 : mdexists(SMgrRelation reln, ForkNumber forknum)
                                174                 : {
 5354 heikki.linnakangas        175 ECB             :     /*
                                176                 :      * Close it first, to ensure that we notice if the fork has been unlinked
                                177                 :      * since we opened it.  As an optimization, we can skip that in recovery,
                                178                 :      * which already closes relations when dropping them.
                                179                 :      */
  367 tmunro                    180 GIC     1061928 :     if (!InRecovery)
  202 pg                        181 GNC      577827 :         mdclose(reln, forknum);
                                182                 : 
                                183         1061928 :     return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
                                184                 : }
                                185                 : 
                                186                 : /*
                                187                 :  *  mdcreate() -- Create a new relation on magnetic disk.
                                188                 :  *
                                189                 :  * If isRedo is true, it's okay for the relation to exist already.
 6998 tgl                       190 ECB             :  */
 5940                           191                 : void
  202 pg                        192 GNC     2721301 : mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 9770 scrappy                   193 ECB             : {
                                194                 :     MdfdVec    *mdfd;
                                195                 :     char       *path;
                                196                 :     File        fd;
                                197                 : 
  202 pg                        198 GNC     2721301 :     if (isRedo && reln->md_num_open_segs[forknum] > 0)
 5940 tgl                       199 GIC     2502428 :         return;                 /* created and opened already... */
                                200                 : 
  202 pg                        201 GNC      218873 :     Assert(reln->md_num_open_segs[forknum] == 0);
 8329 tgl                       202 ECB             : 
                                203                 :     /*
                                204                 :      * We may be using the target table space for the first time in this
                                205                 :      * database, so create a per-database subdirectory if needed.
                                206                 :      *
                                207                 :      * XXX this is a fairly ugly violation of module layering, but this seems
 1362 tmunro                    208                 :      * to be the best place to put the check.  Maybe TablespaceCreateDbspace
                                209                 :      * should be here and not in commands/tablespace.c?  But that would imply
                                210                 :      * importing a lot of stuff that smgr.c oughtn't know, either.
                                211                 :      */
  277 rhaas                     212 GNC      218873 :     TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
                                213                 :                             reln->smgr_rlocator.locator.dbOid,
                                214                 :                             isRedo);
                                215                 : 
  202 pg                        216          218873 :     path = relpath(reln->smgr_rlocator, forknum);
                                217                 : 
    1 tmunro                    218          218873 :     fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
                                219                 : 
 9345 bruce                     220 GIC      218873 :     if (fd < 0)
                                221                 :     {
 8053 bruce                     222 CBC        3824 :         int         save_errno = errno;
                                223                 : 
 1532 akapila                   224 GIC        3824 :         if (isRedo)
    1 tmunro                    225 GNC        3824 :             fd = PathNameOpenFile(path, _mdfd_open_flags());
 9345 bruce                     226 CBC        3824 :         if (fd < 0)
                                227                 :         {
 5940 tgl                       228 ECB             :             /* be sure to report the error reported by create, not open */
 8329 tgl                       229 UIC           0 :             errno = save_errno;
 5940 tgl                       230 LBC           0 :             ereport(ERROR,
                                231                 :                     (errcode_for_file_access(),
 4995 heikki.linnakangas        232 ECB             :                      errmsg("could not create file \"%s\": %m", path)));
                                233                 :         }
 9345 bruce                     234                 :     }
 8187 tgl                       235                 : 
 8187 tgl                       236 CBC      218873 :     pfree(path);
                                237                 : 
  202 pg                        238 GNC      218873 :     _fdvec_resize(reln, forknum, 1);
                                239          218873 :     mdfd = &reln->md_seg_fds[forknum][0];
 2404 andres                    240 GBC      218873 :     mdfd->mdfd_vfd = fd;
 2404 andres                    241 GIC      218873 :     mdfd->mdfd_segno = 0;
                                242                 : }
                                243                 : 
                                244                 : /*
                                245                 :  *  mdunlink() -- Unlink a relation.
 6998 tgl                       246 ECB             :  *
                                247                 :  * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
                                248                 :  * there won't be an SMgrRelation hashtable entry anymore.
                                249                 :  *
                                250                 :  * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
 3916                           251                 :  * to delete all forks.
                                252                 :  *
                                253                 :  * For regular relations, we don't unlink the first segment file of the rel,
                                254                 :  * but just truncate it to zero length, and record a request to unlink it after
                                255                 :  * the next checkpoint.  Additional segments can be unlinked immediately,
                                256                 :  * however.  Leaving the empty file in place prevents that relfilenumber
                                257                 :  * from being reused.  The scenario this protects us from is:
                                258                 :  * 1. We delete a relation (and commit, and actually remove its file).
                                259                 :  * 2. We create a new relation, which by chance gets the same relfilenumber as
                                260                 :  *    the just-deleted one (OIDs must've wrapped around for that to happen).
                                261                 :  * 3. We crash before another checkpoint occurs.
                                262                 :  * During replay, we would delete the file and then recreate it, which is fine
                                263                 :  * if the contents of the file were repopulated by subsequent WAL entries.
                                264                 :  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
                                265                 :  * file after populating it (as we do at wal_level=minimal), the contents of
                                266                 :  * the file would be lost forever.  By leaving the empty file until after the
                                267                 :  * next checkpoint, we prevent reassignment of the relfilenumber until it's
                                268                 :  * safe, because relfilenumber assignment skips over any existing file.
                                269                 :  *
                                270                 :  * Additional segments, if any, are truncated and then unlinked.  The reason
                                271                 :  * for truncating is that other backends may still hold open FDs for these at
                                272                 :  * the smgr level, so that the kernel can't remove the file yet.  We want to
                                273                 :  * reclaim the disk space right away despite that.
                                274                 :  *
                                275                 :  * We do not need to go through this dance for temp relations, though, because
                                276                 :  * we never make WAL entries for temp rels, and so a temp rel poses no threat
                                277                 :  * to the health of a regular rel that has taken over its relfilenumber.
                                278                 :  * The fact that temp rels and regular rels have different file naming
                                279                 :  * patterns provides additional safety.  Other backends shouldn't have open
                                280                 :  * FDs for them, either.
                                281                 :  *
                                282                 :  * We also don't do it while performing a binary upgrade.  There is no reuse
                                283                 :  * hazard in that case, since after a crash or even a simple ERROR, the
                                284                 :  * upgrade fails and the whole cluster must be recreated from scratch.
                                285                 :  * Furthermore, it is important to remove the files from disk immediately,
                                286                 :  * because we may be about to reuse the same relfilenumber.
                                287                 :  *
                                288                 :  * All the above applies only to the relation's main fork; other forks can
                                289                 :  * just be removed immediately, since they are not needed to prevent the
                                290                 :  * relfilenumber from being recycled.  Also, we do not carefully
                                291                 :  * track whether other forks have been created or not, but just attempt to
                                292                 :  * unlink them unconditionally; so we should never complain about ENOENT.
                                293                 :  *
                                294                 :  * If isRedo is true, it's unsurprising for the relation to be already gone.
                                295                 :  * Also, we should remove the file immediately instead of queuing a request
                                296                 :  * for later, since during redo there's no possibility of creating a
                                297                 :  * conflicting relation.
                                298                 :  *
                                299                 :  * Note: we currently just never warn about ENOENT at all.  We could warn in
                                300                 :  * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
                                301                 :  *
                                302                 :  * Note: any failure should be reported as WARNING not ERROR, because
                                303                 :  * we are usually not in a transaction anymore when this is called.
                                304                 :  */
                                305                 : void
  202 pg                        306 GNC      149528 : mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
                                307                 : {
                                308                 :     /* Now do the per-fork work */
                                309          149528 :     if (forknum == InvalidForkNumber)
                                310                 :     {
  202 pg                        311 UNC           0 :         for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
                                312               0 :             mdunlinkfork(rlocator, forknum, isRedo);
                                313                 :     }
                                314                 :     else
  202 pg                        315 GNC      149528 :         mdunlinkfork(rlocator, forknum, isRedo);
 3916 tgl                       316 CBC      149528 : }
                                317                 : 
                                318                 : /*
  859 tmunro                    319 ECB             :  * Truncate a file to release disk space.
                                320                 :  */
  859 tmunro                    321 EUB             : static int
  859 tmunro                    322 GBC      174303 : do_truncate(const char *path)
                                323                 : {
                                324                 :     int         save_errno;
  859 tmunro                    325 ECB             :     int         ret;
                                326                 : 
  859 tmunro                    327 GIC      174303 :     ret = pg_truncate(path, 0);
                                328                 : 
                                329                 :     /* Log a warning here to avoid repetition in callers. */
                                330          174303 :     if (ret < 0 && errno != ENOENT)
                                331                 :     {
  859 tmunro                    332 LBC           0 :         save_errno = errno;
  859 tmunro                    333 UIC           0 :         ereport(WARNING,
                                334                 :                 (errcode_for_file_access(),
                                335                 :                  errmsg("could not truncate file \"%s\": %m", path)));
                                336               0 :         errno = save_errno;
  859 tmunro                    337 ECB             :     }
                                338                 : 
  859 tmunro                    339 GIC      174303 :     return ret;
  859 tmunro                    340 ECB             : }
                                341                 : 
 3916 tgl                       342 EUB             : static void
  202 pg                        343 GNC      149528 : mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
                                344                 : {
                                345                 :     char       *path;
 3916 tgl                       346 EUB             :     int         ret;
                                347                 :     int         save_errno;
                                348                 : 
  202 pg                        349 GNC      149528 :     path = relpath(rlocator, forknum);
                                350                 : 
                                351                 :     /*
                                352                 :      * Truncate and then unlink the first segment, or just register a request
  151 tgl                       353 ECB             :      * to unlink it later, as described in the comments for mdunlink().
                                354                 :      */
  151 tgl                       355 GNC      149528 :     if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
                                356           31279 :         RelFileLocatorBackendIsTemp(rlocator))
                                357                 :     {
  277 rhaas                     358          121057 :         if (!RelFileLocatorBackendIsTemp(rlocator))
  859 tmunro                    359 ECB             :         {
                                360                 :             /* Prevent other backends' fds from holding on to the disk space */
  859 tmunro                    361 GIC      109825 :             ret = do_truncate(path);
                                362                 : 
                                363                 :             /* Forget any pending sync requests for the first segment */
  153 tgl                       364          109825 :             save_errno = errno;
  202 pg                        365 GNC      109825 :             register_forget_request(rlocator, forknum, 0 /* first seg */ );
  153 tgl                       366 CBC      109825 :             errno = save_errno;
                                367                 :         }
  859 tmunro                    368 ECB             :         else
  859 tmunro                    369 GIC       11232 :             ret = 0;
                                370                 : 
  859 tmunro                    371 ECB             :         /* Next unlink the file, unless it was already found to be missing */
  151 tgl                       372 GIC      121057 :         if (ret >= 0 || errno != ENOENT)
                                373                 :         {
  859 tmunro                    374 CBC       18772 :             ret = unlink(path);
                                375           18772 :             if (ret < 0 && errno != ENOENT)
  151 tgl                       376 ECB             :             {
  151 tgl                       377 UIC           0 :                 save_errno = errno;
  859 tmunro                    378               0 :                 ereport(WARNING,
  859 tmunro                    379 ECB             :                         (errcode_for_file_access(),
                                380                 :                          errmsg("could not remove file \"%s\": %m", path)));
  151 tgl                       381 UIC           0 :                 errno = save_errno;
  151 tgl                       382 ECB             :             }
                                383                 :         }
 4995 heikki.linnakangas        384                 :     }
 5624 tgl                       385                 :     else
                                386                 :     {
  859 tmunro                    387 EUB             :         /* Prevent other backends' fds from holding on to the disk space */
  859 tmunro                    388 GBC       28471 :         ret = do_truncate(path);
                                389                 : 
                                390                 :         /* Register request to unlink first segment later */
  151 tgl                       391           28471 :         save_errno = errno;
  151 tgl                       392 GNC       28471 :         register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
  151 tgl                       393 GIC       28471 :         errno = save_errno;
                                394                 :     }
                                395                 : 
                                396                 :     /*
                                397                 :      * Delete any additional segments.
  151 tgl                       398 ECB             :      *
                                399                 :      * Note that because we loop until getting ENOENT, we will correctly
                                400                 :      * remove all inactive segments as well as active ones.  Ideally we'd
                                401                 :      * continue the loop until getting exactly that errno, but that risks an
                                402                 :      * infinite loop if the problem is directory-wide (for instance, if we
                                403                 :      * suddenly can't read the data directory itself).  We compromise by
                                404                 :      * continuing after a non-ENOENT truncate error, but stopping after any
                                405                 :      * unlink error.  If there is indeed a directory-wide problem, additional
                                406                 :      * unlink attempts wouldn't work anyway.
                                407                 :      */
  151 tgl                       408 GIC      149528 :     if (ret >= 0 || errno != ENOENT)
                                409                 :     {
 8187                           410           38946 :         char       *segpath = (char *) palloc(strlen(path) + 12);
                                411                 :         BlockNumber segno;
                                412                 : 
  151                           413           38946 :         for (segno = 1;; segno++)
                                414                 :         {
                                415           38946 :             sprintf(segpath, "%s.%u", path, segno);
                                416                 : 
  277 rhaas                     417 GNC       38946 :             if (!RelFileLocatorBackendIsTemp(rlocator))
  859 tmunro                    418 ECB             :             {
                                419                 :                 /*
                                420                 :                  * Prevent other backends' fds from holding on to the disk
                                421                 :                  * space.  We're done if we see ENOENT, though.
                                422                 :                  */
  859 tmunro                    423 CBC       36007 :                 if (do_truncate(segpath) < 0 && errno == ENOENT)
  859 tmunro                    424 GIC       36007 :                     break;
  859 tmunro                    425 ECB             : 
                                426                 :                 /*
                                427                 :                  * Forget any pending sync requests for this segment before we
                                428                 :                  * try to unlink.
                                429                 :                  */
  202 pg                        430 UNC           0 :                 register_forget_request(rlocator, forknum, segno);
                                431                 :             }
                                432                 : 
 8187 tgl                       433 CBC        2939 :             if (unlink(segpath) < 0)
 8187 tgl                       434 ECB             :             {
                                435                 :                 /* ENOENT is expected after the last segment... */
 8187 tgl                       436 GIC        2939 :                 if (errno != ENOENT)
 5940 tgl                       437 UIC           0 :                     ereport(WARNING,
                                438                 :                             (errcode_for_file_access(),
                                439                 :                              errmsg("could not remove file \"%s\": %m", segpath)));
 8187 tgl                       440 GBC        2939 :                 break;
                                441                 :             }
                                442                 :         }
 8187 tgl                       443 CBC       38946 :         pfree(segpath);
                                444                 :     }
                                445                 : 
                                446          149528 :     pfree(path);
 9770 scrappy                   447 GBC      149528 : }
                                448                 : 
                                449                 : /*
 9345 bruce                     450 ECB             :  *  mdextend() -- Add a block to the specified relation.
                                451                 :  *
                                452                 :  *      The semantics are nearly the same as mdwrite(): write at the
 5940 tgl                       453                 :  *      specified position.  However, this is to be used for the case of
                                454                 :  *      extending a relation (i.e., blocknum is at or beyond the current
                                455                 :  *      EOF).  Note that we assume writing a block beyond current EOF
                                456                 :  *      causes intervening file space to become filled with zeroes.
 9770 scrappy                   457                 :  */
                                458                 : void
 5354 heikki.linnakangas        459 GIC      177935 : mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                460                 :          const void *buffer, bool skipFsync)
                                461                 : {
                                462                 :     off_t       seekpos;
                                463                 :     int         nbytes;
                                464                 :     MdfdVec    *v;
                                465                 : 
                                466                 :     /* If this build supports direct I/O, the buffer must be I/O aligned. */
                                467                 :     if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
    1 tmunro                    468 GNC      177935 :         Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
                                469                 : 
                                470                 :     /* This assert is too expensive to have on normally ... */
                                471                 : #ifdef CHECK_WRITE_VS_EXTEND
                                472                 :     Assert(blocknum >= mdnblocks(reln, forknum));
 5940 tgl                       473 ECB             : #endif
                                474                 : 
                                475                 :     /*
                                476                 :      * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
                                477                 :      * more --- we mustn't create a block whose number actually is
                                478                 :      * InvalidBlockNumber.  (Note that this failure should be unreachable
                                479                 :      * because of upstream checks in bufmgr.c.)
                                480                 :      */
 5940 tgl                       481 GIC      177935 :     if (blocknum == InvalidBlockNumber)
 5940 tgl                       482 LBC           0 :         ereport(ERROR,
                                483                 :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                484                 :                  errmsg("cannot extend file \"%s\" beyond %u blocks",
                                485                 :                         relpath(reln->smgr_rlocator, forknum),
                                486                 :                         InvalidBlockNumber)));
                                487                 : 
 4622 rhaas                     488 GIC      177935 :     v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
                                489                 : 
 2118 tgl                       490          177935 :     seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                491                 : 
 5508                           492          177935 :     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
                                493                 : 
 1614 tmunro                    494          177935 :     if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
 8586 inoue                     495 ECB             :     {
 5940 tgl                       496 UBC           0 :         if (nbytes < 0)
 5940 tgl                       497 UIC           0 :             ereport(ERROR,
                                498                 :                     (errcode_for_file_access(),
                                499                 :                      errmsg("could not extend file \"%s\": %m",
                                500                 :                             FilePathName(v->mdfd_vfd)),
                                501                 :                      errhint("Check free disk space.")));
 5940 tgl                       502 ECB             :         /* short write: complain appropriately */
 5940 tgl                       503 UIC           0 :         ereport(ERROR,
 5940 tgl                       504 ECB             :                 (errcode(ERRCODE_DISK_FULL),
                                505                 :                  errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
 4995 heikki.linnakangas        506                 :                         FilePathName(v->mdfd_vfd),
                                507                 :                         nbytes, BLCKSZ, blocknum),
 5940 tgl                       508                 :                  errhint("Check free disk space.")));
                                509                 :     }
 9770 scrappy                   510 EUB             : 
 4622 rhaas                     511 GBC      177935 :     if (!skipFsync && !SmgrIsTemp(reln))
 5354 heikki.linnakangas        512 GIC          28 :         register_dirty_segment(reln, forknum, v);
                                513                 : 
                                514          177935 :     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 9770 scrappy                   515          177935 : }
                                516                 : 
                                517                 : /*
                                518                 :  *  mdzeroextend() -- Add new zeroed out blocks to the specified relation.
                                519                 :  *
                                520                 :  *      Similar to mdextend(), except the relation can be extended by multiple
                                521                 :  *      blocks at once and the added blocks will be filled with zeroes.
                                522                 :  */
                                523                 : void
    4 andres                    524 GNC      343785 : mdzeroextend(SMgrRelation reln, ForkNumber forknum,
                                525                 :              BlockNumber blocknum, int nblocks, bool skipFsync)
                                526                 : {
                                527                 :     MdfdVec    *v;
                                528          343785 :     BlockNumber curblocknum = blocknum;
                                529          343785 :     int         remblocks = nblocks;
                                530                 : 
                                531          343785 :     Assert(nblocks > 0);
                                532                 : 
                                533                 :     /* This assert is too expensive to have on normally ... */
                                534                 : #ifdef CHECK_WRITE_VS_EXTEND
                                535                 :     Assert(blocknum >= mdnblocks(reln, forknum));
                                536                 : #endif
                                537                 : 
                                538                 :     /*
                                539                 :      * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
                                540                 :      * more --- we mustn't create a block whose number actually is
                                541                 :      * InvalidBlockNumber or larger.
                                542                 :      */
                                543          343785 :     if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
    4 andres                    544 UNC           0 :         ereport(ERROR,
                                545                 :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                546                 :                  errmsg("cannot extend file \"%s\" beyond %u blocks",
                                547                 :                         relpath(reln->smgr_rlocator, forknum),
                                548                 :                         InvalidBlockNumber)));
                                549                 : 
    4 andres                    550 GNC      687570 :     while (remblocks > 0)
                                551                 :     {
                                552          343785 :         BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
                                553          343785 :         off_t       seekpos = (off_t) BLCKSZ * segstartblock;
                                554                 :         int         numblocks;
                                555                 : 
                                556          343785 :         if (segstartblock + remblocks > RELSEG_SIZE)
    4 andres                    557 UNC           0 :             numblocks = RELSEG_SIZE - segstartblock;
                                558                 :         else
    4 andres                    559 GNC      343785 :             numblocks = remblocks;
                                560                 : 
                                561          343785 :         v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
                                562                 : 
                                563          343785 :         Assert(segstartblock < RELSEG_SIZE);
                                564          343785 :         Assert(segstartblock + numblocks <= RELSEG_SIZE);
                                565                 : 
                                566                 :         /*
                                567                 :          * If available and useful, use posix_fallocate() (via FileAllocate())
                                568                 :          * to extend the relation. That's often more efficient than using
                                569                 :          * write(), as it commonly won't cause the kernel to allocate page
                                570                 :          * cache space for the extended pages.
                                571                 :          *
                                572                 :          * However, we don't use FileAllocate() for small extensions, as it
                                573                 :          * defeats delayed allocation on some filesystems. Not clear where
                                574                 :          * that decision should be made though? For now just use a cutoff of
                                575                 :          * 8, anything between 4 and 8 worked OK in some local testing.
                                576                 :          */
                                577          343785 :         if (numblocks > 8)
                                578                 :         {
                                579                 :             int         ret;
                                580                 : 
                                581             385 :             ret = FileFallocate(v->mdfd_vfd,
                                582                 :                                 seekpos, (off_t) BLCKSZ * numblocks,
                                583                 :                                 WAIT_EVENT_DATA_FILE_EXTEND);
                                584             385 :             if (ret != 0)
                                585                 :             {
    4 andres                    586 UNC           0 :                 ereport(ERROR,
                                587                 :                         errcode_for_file_access(),
                                588                 :                         errmsg("could not extend file \"%s\" with FileFallocate(): %m",
                                589                 :                                FilePathName(v->mdfd_vfd)),
                                590                 :                         errhint("Check free disk space."));
                                591                 :             }
                                592                 :         }
                                593                 :         else
                                594                 :         {
                                595                 :             int         ret;
                                596                 : 
                                597                 :             /*
                                598                 :              * Even if we don't want to use fallocate, we can still extend a
                                599                 :              * bit more efficiently than writing each 8kB block individually.
                                600                 :              * pg_pwrite_zeroes() (via FileZero()) uses
                                601                 :              * pg_pwritev_with_retry() to avoid multiple writes or needing a
                                602                 :              * zeroed buffer for the whole length of the extension.
                                603                 :              */
    4 andres                    604 GNC      343400 :             ret = FileZero(v->mdfd_vfd,
                                605                 :                            seekpos, (off_t) BLCKSZ * numblocks,
                                606                 :                            WAIT_EVENT_DATA_FILE_EXTEND);
                                607          343400 :             if (ret < 0)
    4 andres                    608 UNC           0 :                 ereport(ERROR,
                                609                 :                         errcode_for_file_access(),
                                610                 :                         errmsg("could not extend file \"%s\": %m",
                                611                 :                                FilePathName(v->mdfd_vfd)),
                                612                 :                         errhint("Check free disk space."));
                                613                 :         }
                                614                 : 
    4 andres                    615 GNC      343785 :         if (!skipFsync && !SmgrIsTemp(reln))
                                616          332890 :             register_dirty_segment(reln, forknum, v);
                                617                 : 
                                618          343785 :         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
                                619                 : 
                                620          343785 :         remblocks -= numblocks;
                                621          343785 :         curblocknum += numblocks;
                                622                 :     }
                                623          343785 : }
                                624                 : 
 9770 scrappy                   625 EUB             : /*
                                626                 :  *  mdopenfork() -- Open one fork of the specified relation.
                                627                 :  *
                                628                 :  * Note we only open the first segment, when there are multiple segments.
                                629                 :  *
                                630                 :  * If first segment is not present, either ereport or return NULL according
                                631                 :  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
                                632                 :  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 5940 tgl                       633 ECB             :  * invent one out of whole cloth.
 9770 scrappy                   634                 :  */
                                635                 : static MdfdVec *
 1362 tmunro                    636 CBC     3993243 : mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
 9770 scrappy                   637 ECB             : {
                                638                 :     MdfdVec    *mdfd;
                                639                 :     char       *path;
                                640                 :     File        fd;
                                641                 : 
                                642                 :     /* No work if already open */
 2404 andres                    643 GIC     3993243 :     if (reln->md_num_open_segs[forknum] > 0)
                                644         2536346 :         return &reln->md_seg_fds[forknum][0];
                                645                 : 
  277 rhaas                     646 GNC     1456897 :     path = relpath(reln->smgr_rlocator, forknum);
                                647                 : 
    1 tmunro                    648         1456897 :     fd = PathNameOpenFile(path, _mdfd_open_flags());
                                649                 : 
 9345 bruce                     650 CBC     1456897 :     if (fd < 0)
 8617 tgl                       651 ECB             :     {
 1532 akapila                   652 GIC      409266 :         if ((behavior & EXTENSION_RETURN_NULL) &&
 1532 akapila                   653 CBC      409245 :             FILE_POSSIBLY_DELETED(errno))
                                654                 :         {
 1532 akapila                   655 GIC      409245 :             pfree(path);
                                656          409245 :             return NULL;
                                657                 :         }
                                658              21 :         ereport(ERROR,
                                659                 :                 (errcode_for_file_access(),
                                660                 :                  errmsg("could not open file \"%s\": %m", path)));
                                661                 :     }
                                662                 : 
 8187 tgl                       663         1047631 :     pfree(path);
                                664                 : 
 2404 andres                    665 CBC     1047631 :     _fdvec_resize(reln, forknum, 1);
 2404 andres                    666 GBC     1047631 :     mdfd = &reln->md_seg_fds[forknum][0];
 6887 tgl                       667 GIC     1047631 :     mdfd->mdfd_vfd = fd;
                                668         1047631 :     mdfd->mdfd_segno = 0;
                                669                 : 
 5354 heikki.linnakangas        670         1047631 :     Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
                                671                 : 
 6887 tgl                       672 CBC     1047631 :     return mdfd;
                                673                 : }
 9770 scrappy                   674 ECB             : 
 1362 tmunro                    675                 : /*
                                676                 :  *  mdopen() -- Initialize newly-opened relation.
                                677                 :  */
                                678                 : void
 1362 tmunro                    679 GBC     1342539 : mdopen(SMgrRelation reln)
                                680                 : {
 1362 tmunro                    681 ECB             :     /* mark it not open */
 1362 tmunro                    682 GIC     6712695 :     for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 1362 tmunro                    683 CBC     5370156 :         reln->md_num_open_segs[forknum] = 0;
 1362 tmunro                    684 GIC     1342539 : }
 1362 tmunro                    685 ECB             : 
 9770 scrappy                   686                 : /*
                                687                 :  *  mdclose() -- Close the specified relation, if it isn't closed already.
                                688                 :  */
                                689                 : void
 5354 heikki.linnakangas        690 GIC     4185971 : mdclose(SMgrRelation reln, ForkNumber forknum)
                                691                 : {
 2404 andres                    692         4185971 :     int         nopensegs = reln->md_num_open_segs[forknum];
                                693                 : 
                                694                 :     /* No work if already closed */
                                695         4185971 :     if (nopensegs == 0)
 5940 tgl                       696         3212515 :         return;
                                697                 : 
                                698                 :     /* close segments starting from the end */
 2404 andres                    699 CBC     1946912 :     while (nopensegs > 0)
                                700                 :     {
 2404 andres                    701 GIC      973456 :         MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
                                702                 : 
 1185 noah                      703 CBC      973456 :         FileClose(v->mdfd_vfd);
 1185 noah                      704 GIC      973456 :         _fdvec_resize(reln, forknum, nopensegs - 1);
 2404 andres                    705          973456 :         nopensegs--;
 9345 bruce                     706 ECB             :     }
                                707                 : }
 9770 scrappy                   708 EUB             : 
                                709                 : /*
                                710                 :  *  mdprefetch() -- Initiate asynchronous read of the specified block of a relation
                                711                 :  */
                                712                 : bool
 5200 tgl                       713 GIC      212103 : mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
                                714                 : {
                                715                 : #ifdef USE_PREFETCH
                                716                 :     off_t       seekpos;
                                717                 :     MdfdVec    *v;
                                718                 : 
    1 tmunro                    719 GNC      212103 :     Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
                                720                 : 
 1096 tmunro                    721 GIC      212103 :     v = _mdfd_getseg(reln, forknum, blocknum, false,
                                722          212103 :                      InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
                                723          212103 :     if (v == NULL)
 1096 tmunro                    724 UIC           0 :         return false;
                                725                 : 
 2118 tgl                       726 GIC      212103 :     seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                727                 : 
 5200 tgl                       728 CBC      212103 :     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
                                729                 : 
 2213 rhaas                     730 GIC      212103 :     (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
 2118 tgl                       731 ECB             : #endif                          /* USE_PREFETCH */
 1096 tmunro                    732 EUB             : 
 1096 tmunro                    733 GIC      212103 :     return true;
                                734                 : }
                                735                 : 
                                736                 : /*
                                737                 :  * mdwriteback() -- Tell the kernel to write pages back to storage.
                                738                 :  *
 2606 andres                    739 ECB             :  * This accepts a range of blocks because flushing several pages at once is
                                740                 :  * considerably more efficient than doing so individually.
                                741                 :  */
                                742                 : void
 2552 tgl                       743 GIC      134299 : mdwriteback(SMgrRelation reln, ForkNumber forknum,
 2552 tgl                       744 ECB             :             BlockNumber blocknum, BlockNumber nblocks)
 2606 andres                    745                 : {
    1 tmunro                    746 GNC      134299 :     Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
                                747                 : 
                                748                 :     /*
 2606 andres                    749 ECB             :      * Issue flush requests in as few requests as possible; have to split at
                                750                 :      * segment boundaries though, since those are actually separate files.
                                751                 :      */
 2552 tgl                       752 GIC      268321 :     while (nblocks > 0)
                                753                 :     {
                                754          134299 :         BlockNumber nflush = nblocks;
                                755                 :         off_t       seekpos;
                                756                 :         MdfdVec    *v;
                                757                 :         int         segnum_start,
                                758                 :                     segnum_end;
                                759                 : 
 2531 andres                    760          134299 :         v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
                                761                 :                          EXTENSION_DONT_OPEN);
 2606 andres                    762 ECB             : 
                                763                 :         /*
                                764                 :          * We might be flushing buffers of already removed relations, that's
                                765                 :          * ok, just ignore that case.  If the segment file wasn't open already
                                766                 :          * (ie from a recent mdwrite()), then we don't want to re-open it, to
                                767                 :          * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
                                768                 :          * us with a descriptor to a file that is about to be unlinked.
                                769                 :          */
 2606 andres                    770 CBC      134299 :         if (!v)
 2606 andres                    771 GIC         277 :             return;
 2606 andres                    772 ECB             : 
                                773                 :         /* compute offset inside the current segment */
 2606 andres                    774 CBC      134022 :         segnum_start = blocknum / RELSEG_SIZE;
                                775                 : 
 2606 andres                    776 ECB             :         /* compute number of desired writes within the current segment */
 2606 andres                    777 GIC      134022 :         segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
 2606 andres                    778 CBC      134022 :         if (segnum_start != segnum_end)
 2606 andres                    779 LBC           0 :             nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
                                780                 : 
 2606 andres                    781 CBC      134022 :         Assert(nflush >= 1);
                                782          134022 :         Assert(nflush <= nblocks);
                                783                 : 
 2118 tgl                       784          134022 :         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                785                 : 
 2213 rhaas                     786 GIC      134022 :         FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
                                787                 : 
 2606 andres                    788          134022 :         nblocks -= nflush;
 2606 andres                    789 CBC      134022 :         blocknum += nflush;
                                790                 :     }
 2606 andres                    791 ECB             : }
 5200 tgl                       792                 : 
 9770 scrappy                   793                 : /*
 9345 bruce                     794                 :  *  mdread() -- Read the specified block from a relation.
                                795                 :  */
 5940 tgl                       796                 : void
 5354 heikki.linnakangas        797 GIC     1310557 : mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                798                 :        void *buffer)
                                799                 : {
                                800                 :     off_t       seekpos;
                                801                 :     int         nbytes;
                                802                 :     MdfdVec    *v;
                                803                 : 
                                804                 :     /* If this build supports direct I/O, the buffer must be I/O aligned. */
                                805                 :     if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
    1 tmunro                    806 GNC     1310557 :         Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
                                807                 : 
                                808                 :     TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
                                809                 :                                         reln->smgr_rlocator.locator.spcOid,
                                810                 :                                         reln->smgr_rlocator.locator.dbOid,
                                811                 :                                         reln->smgr_rlocator.locator.relNumber,
                                812                 :                                         reln->smgr_rlocator.backend);
 5226 bruce                     813 ECB             : 
 2531 andres                    814 CBC     1310557 :     v = _mdfd_getseg(reln, forknum, blocknum, false,
                                815                 :                      EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
                                816                 : 
 2118 tgl                       817 GIC     1310544 :     seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                818                 : 
 5508                           819         1310544 :     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 9770 scrappy                   820 ECB             : 
 1614 tmunro                    821 GIC     1310544 :     nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
 5226 bruce                     822 ECB             : 
                                823                 :     TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                824                 :                                        reln->smgr_rlocator.locator.spcOid,
                                825                 :                                        reln->smgr_rlocator.locator.dbOid,
                                826                 :                                        reln->smgr_rlocator.locator.relNumber,
                                827                 :                                        reln->smgr_rlocator.backend,
                                828                 :                                        nbytes,
 5142 tgl                       829                 :                                        BLCKSZ);
                                830                 : 
 5226 bruce                     831 CBC     1310544 :     if (nbytes != BLCKSZ)
                                832                 :     {
 5940 tgl                       833 LBC           0 :         if (nbytes < 0)
                                834               0 :             ereport(ERROR,
 5940 tgl                       835 ECB             :                     (errcode_for_file_access(),
                                836                 :                      errmsg("could not read block %u in file \"%s\": %m",
                                837                 :                             blocknum, FilePathName(v->mdfd_vfd))));
                                838                 : 
                                839                 :         /*
                                840                 :          * Short read: we are at or past EOF, or we read a partial block at
                                841                 :          * EOF.  Normally this is an error; upper levels should never try to
                                842                 :          * read a nonexistent block.  However, if zero_damaged_pages is ON or
 5624 bruce                     843                 :          * we are InRecovery, we should instead return zeroes without
                                844                 :          * complaining.  This allows, for example, the case of trying to
                                845                 :          * update a block that was later truncated away.
                                846                 :          */
 5940 tgl                       847 UIC           0 :         if (zero_damaged_pages || InRecovery)
 8586 inoue                     848               0 :             MemSet(buffer, 0, BLCKSZ);
 9345 bruce                     849 ECB             :         else
 5940 tgl                       850 UIC           0 :             ereport(ERROR,
 5940 tgl                       851 ECB             :                     (errcode(ERRCODE_DATA_CORRUPTED),
 4995 heikki.linnakangas        852                 :                      errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
                                853                 :                             blocknum, FilePathName(v->mdfd_vfd),
 5940 tgl                       854 EUB             :                             nbytes, BLCKSZ)));
                                855                 :     }
 9770 scrappy                   856 CBC     1310544 : }
                                857                 : 
 9770 scrappy                   858 ECB             : /*
                                859                 :  *  mdwrite() -- Write the supplied block at the appropriate location.
 5940 tgl                       860                 :  *
                                861                 :  *      This is to be used only for updating already-existing blocks of a
                                862                 :  *      relation (ie, those before the current EOF).  To extend a relation,
                                863                 :  *      use mdextend().
                                864                 :  */
                                865                 : void
 5354 heikki.linnakangas        866 GIC      748947 : mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                867                 :         const void *buffer, bool skipFsync)
                                868                 : {
                                869                 :     off_t       seekpos;
                                870                 :     int         nbytes;
                                871                 :     MdfdVec    *v;
                                872                 : 
                                873                 :     /* If this build supports direct I/O, the buffer must be I/O aligned. */
                                874                 :     if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
    1 tmunro                    875 GNC      748947 :         Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
                                876                 : 
 5940 tgl                       877 ECB             :     /* This assert is too expensive to have on normally ... */
                                878                 : #ifdef CHECK_WRITE_VS_EXTEND
                                879                 :     Assert(blocknum < mdnblocks(reln, forknum));
                                880                 : #endif
                                881                 : 
                                882                 :     TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
                                883                 :                                          reln->smgr_rlocator.locator.spcOid,
                                884                 :                                          reln->smgr_rlocator.locator.dbOid,
                                885                 :                                          reln->smgr_rlocator.locator.relNumber,
                                886                 :                                          reln->smgr_rlocator.backend);
                                887                 : 
 2531 andres                    888 CBC      748947 :     v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
                                889                 :                      EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
                                890                 : 
 2118 tgl                       891 GIC      748947 :     seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                892                 : 
 5508                           893          748947 :     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 9770 scrappy                   894 ECB             : 
 1614 tmunro                    895 GIC      748947 :     nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
                                896                 : 
                                897                 :     TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                                898                 :                                         reln->smgr_rlocator.locator.spcOid,
                                899                 :                                         reln->smgr_rlocator.locator.dbOid,
                                900                 :                                         reln->smgr_rlocator.locator.relNumber,
                                901                 :                                         reln->smgr_rlocator.backend,
                                902                 :                                         nbytes,
                                903                 :                                         BLCKSZ);
 5226 bruce                     904 ECB             : 
 5226 bruce                     905 CBC      748947 :     if (nbytes != BLCKSZ)
                                906                 :     {
 5940 tgl                       907 UIC           0 :         if (nbytes < 0)
 5940 tgl                       908 LBC           0 :             ereport(ERROR,
                                909                 :                     (errcode_for_file_access(),
                                910                 :                      errmsg("could not write block %u in file \"%s\": %m",
 4995 heikki.linnakangas        911 ECB             :                             blocknum, FilePathName(v->mdfd_vfd))));
 5940 tgl                       912                 :         /* short write: complain appropriately */
 5940 tgl                       913 UBC           0 :         ereport(ERROR,
                                914                 :                 (errcode(ERRCODE_DISK_FULL),
 4995 heikki.linnakangas        915 ECB             :                  errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
 5940 tgl                       916                 :                         blocknum,
                                917                 :                         FilePathName(v->mdfd_vfd),
                                918                 :                         nbytes, BLCKSZ),
                                919                 :                  errhint("Check free disk space.")));
 6887                           920                 :     }
                                921                 : 
 4622 rhaas                     922 CBC      748947 :     if (!skipFsync && !SmgrIsTemp(reln))
 5354 heikki.linnakangas        923          725820 :         register_dirty_segment(reln, forknum, v);
 8400 tgl                       924 GIC      748947 : }
                                925                 : 
                                926                 : /*
                                927                 :  *  mdnblocks() -- Get the number of blocks stored in a relation.
                                928                 :  *
                                929                 :  *      Important side effect: all active segments of the relation are opened
                                930                 :  *      and added to the md_seg_fds array.  If this routine has not been
 8620 tgl                       931 ECB             :  *      called, then only segments up to the last one actually touched
                                932                 :  *      are present in the array.
                                933                 :  */
                                934                 : BlockNumber
 5354 heikki.linnakangas        935 GIC     2682691 : mdnblocks(SMgrRelation reln, ForkNumber forknum)
                                936                 : {
                                937                 :     MdfdVec    *v;
                                938                 :     BlockNumber nblocks;
                                939                 :     BlockNumber segno;
  948 bruce                     940 ECB             : 
  948 bruce                     941 GIC     2682691 :     mdopenfork(reln, forknum, EXTENSION_FAIL);
                                942                 : 
                                943                 :     /* mdopen has opened the first segment */
 2404 andres                    944         2682673 :     Assert(reln->md_num_open_segs[forknum] > 0);
                                945                 : 
                                946                 :     /*
                                947                 :      * Start from the last open segments, to avoid redundant seeks.  We have
 2404 andres                    948 ECB             :      * previously verified that these segments are exactly RELSEG_SIZE long,
                                949                 :      * and it's useless to recheck that each time.
                                950                 :      *
 5984 tgl                       951                 :      * NOTE: this assumption could only be wrong if another backend has
                                952                 :      * truncated the relation.  We rely on higher code levels to handle that
                                953                 :      * scenario by closing and re-opening the md fd, which is handled via
                                954                 :      * relcache flush.  (Since the checkpointer doesn't participate in
 2404 andres                    955                 :      * relcache flush, it could have segment entries for inactive segments;
                                956                 :      * that's OK because the checkpointer never needs to compute relation
                                957                 :      * size.)
                                958                 :      */
 2404 andres                    959 GIC     2682673 :     segno = reln->md_num_open_segs[forknum] - 1;
                                960         2682673 :     v = &reln->md_seg_fds[forknum][segno];
                                961                 : 
                                962                 :     for (;;)
                                963                 :     {
 5354 heikki.linnakangas        964         2682673 :         nblocks = _mdnblocks(reln, forknum, v);
 7956 tgl                       965 CBC     2682673 :         if (nblocks > ((BlockNumber) RELSEG_SIZE))
 7199 tgl                       966 UIC           0 :             elog(FATAL, "segment too big");
 7956 tgl                       967 GBC     2682673 :         if (nblocks < ((BlockNumber) RELSEG_SIZE))
                                968         2682673 :             return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
                                969                 : 
                                970                 :         /*
                                971                 :          * If segment is exactly RELSEG_SIZE, advance to next one.
                                972                 :          */
 8004 tgl                       973 UIC           0 :         segno++;
                                974                 : 
                                975                 :         /*
                                976                 :          * We used to pass O_CREAT here, but that has the disadvantage that it
                                977                 :          * might create a segment which has vanished through some operating
                                978                 :          * system misadventure.  In such a case, creating the segment here
                                979                 :          * undermines _mdfd_getseg's attempts to notice and report an error
                                980                 :          * upon access to a missing segment.
 2404 andres                    981 EUB             :          */
 2404 andres                    982 UBC           0 :         v = _mdfd_openseg(reln, forknum, segno, 0);
 2404 andres                    983 UIC           0 :         if (v == NULL)
 2404 andres                    984 UBC           0 :             return segno * ((BlockNumber) RELSEG_SIZE);
                                985                 :     }
                                986                 : }
                                987                 : 
                                988                 : /*
                                989                 :  *  mdtruncate() -- Truncate relation to specified number of blocks.
 9629 vadim4o                   990 ECB             :  */
                                991                 : void
 4622 rhaas                     992 GIC         663 : mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
                                993                 : {
                                994                 :     BlockNumber curnblk;
                                995                 :     BlockNumber priorblocks;
                                996                 :     int         curopensegs;
                                997                 : 
                                998                 :     /*
                                999                 :      * NOTE: mdnblocks makes sure we have opened all active segments, so that
 5624 bruce                    1000 ECB             :      * truncation loop will get them all!
                               1001                 :      */
 5354 heikki.linnakangas       1002 GIC         663 :     curnblk = mdnblocks(reln, forknum);
 7956 tgl                      1003             663 :     if (nblocks > curnblk)
                               1004                 :     {
                               1005                 :         /* Bogus request ... but no complaint if InRecovery */
 5940 tgl                      1006 UIC           0 :         if (InRecovery)
                               1007               0 :             return;
                               1008               0 :         ereport(ERROR,
 4995 heikki.linnakangas       1009 ECB             :                 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
                               1010                 :                         relpath(reln->smgr_rlocator, forknum),
                               1011                 :                         nblocks, curnblk)));
                               1012                 :     }
 8620 tgl                      1013 GIC         663 :     if (nblocks == curnblk)
 5940                          1014             276 :         return;                 /* no work */
                               1015                 : 
                               1016                 :     /*
                               1017                 :      * Truncate segments, starting at the last one. Starting at the end makes
                               1018                 :      * managing the memory for the fd array easier, should there be errors.
                               1019                 :      */
 2404 andres                   1020             387 :     curopensegs = reln->md_num_open_segs[forknum];
                               1021             774 :     while (curopensegs > 0)
 8696 bruce                    1022 ECB             :     {
                               1023                 :         MdfdVec    *v;
                               1024                 : 
 2404 andres                   1025 CBC         387 :         priorblocks = (curopensegs - 1) * RELSEG_SIZE;
                               1026                 : 
                               1027             387 :         v = &reln->md_seg_fds[forknum][curopensegs - 1];
                               1028                 : 
 8620 tgl                      1029             387 :         if (priorblocks > nblocks)
                               1030                 :         {
                               1031                 :             /*
                               1032                 :              * This segment is no longer active. We truncate the file, but do
                               1033                 :              * not delete it, for reasons explained in the header comments.
                               1034                 :              */
 2213 rhaas                    1035 UIC           0 :             if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
 5940 tgl                      1036               0 :                 ereport(ERROR,
                               1037                 :                         (errcode_for_file_access(),
                               1038                 :                          errmsg("could not truncate file \"%s\": %m",
 4995 heikki.linnakangas       1039 ECB             :                                 FilePathName(v->mdfd_vfd))));
                               1040                 : 
 4622 rhaas                    1041 UBC           0 :             if (!SmgrIsTemp(reln))
 5354 heikki.linnakangas       1042               0 :                 register_dirty_segment(reln, forknum, v);
                               1043                 : 
                               1044                 :             /* we never drop the 1st segment */
 2404 andres                   1045 UIC           0 :             Assert(v != &reln->md_seg_fds[forknum][0]);
                               1046                 : 
 2404 andres                   1047 UBC           0 :             FileClose(v->mdfd_vfd);
 2404 andres                   1048 UIC           0 :             _fdvec_resize(reln, forknum, curopensegs - 1);
                               1049                 :         }
 7956 tgl                      1050 GIC         387 :         else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
                               1051                 :         {
                               1052                 :             /*
                               1053                 :              * This is the last segment we want to keep. Truncate the file to
                               1054                 :              * the right length. NOTE: if nblocks is exactly a multiple K of
                               1055                 :              * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
 2404 andres                   1056 ECB             :              * keep it. This adheres to the invariant given in the header
                               1057                 :              * comments.
 8620 tgl                      1058                 :              */
 7836 bruce                    1059 GIC         387 :             BlockNumber lastsegblocks = nblocks - priorblocks;
                               1060                 : 
 2213 rhaas                    1061             387 :             if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
 5940 tgl                      1062 UIC           0 :                 ereport(ERROR,
                               1063                 :                         (errcode_for_file_access(),
                               1064                 :                          errmsg("could not truncate file \"%s\" to %u blocks: %m",
                               1065                 :                                 FilePathName(v->mdfd_vfd),
                               1066                 :                                 nblocks)));
 4622 rhaas                    1067 GIC         387 :             if (!SmgrIsTemp(reln))
 5354 heikki.linnakangas       1068             250 :                 register_dirty_segment(reln, forknum, v);
 8620 tgl                      1069 ECB             :         }
                               1070                 :         else
                               1071                 :         {
                               1072                 :             /*
                               1073                 :              * We still need this segment, so nothing to do for this and any
                               1074                 :              * earlier segment.
                               1075                 :              */
 2404 andres                   1076 UIC           0 :             break;
                               1077                 :         }
 2404 andres                   1078 CBC         387 :         curopensegs--;
                               1079                 :     }
                               1080                 : }
                               1081                 : 
                               1082                 : /*
                               1083                 :  *  mdimmedsync() -- Immediately sync a relation to stable storage.
                               1084                 :  *
                               1085                 :  * Note that only writes already issued are synced; this routine knows
                               1086                 :  * nothing of dirty buffers that may exist inside the buffer manager.  We
                               1087                 :  * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
                               1088                 :  * Consider a relation skipping WAL.  Suppose a checkpoint syncs blocks of
                               1089                 :  * some segment, then mdtruncate() renders that segment inactive.  If we
                               1090                 :  * crash before the next checkpoint syncs the newly-inactive segment, that
                               1091                 :  * segment may survive recovery, reintroducing unwanted data into the table.
                               1092                 :  */
 5940 tgl                      1093 ECB             : void
 5354 heikki.linnakangas       1094 CBC       59652 : mdimmedsync(SMgrRelation reln, ForkNumber forknum)
                               1095                 : {
                               1096                 :     int         segno;
                               1097                 :     int         min_inactive_seg;
 6885 tgl                      1098 ECB             : 
                               1099                 :     /*
 5624 bruce                    1100 EUB             :      * NOTE: mdnblocks makes sure we have opened all active segments, so that
 5624 bruce                    1101 ECB             :      * fsync loop will get them all!
 6885 tgl                      1102                 :      */
 4381 peter_e                  1103 GIC       59652 :     mdnblocks(reln, forknum);
                               1104                 : 
 1100 noah                     1105           59652 :     min_inactive_seg = segno = reln->md_num_open_segs[forknum];
                               1106                 : 
 1100 noah                     1107 EUB             :     /*
                               1108                 :      * Temporarily open inactive segments, then close them after sync.  There
                               1109                 :      * may be some inactive segments left opened after fsync() error, but that
                               1110                 :      * is harmless.  We don't bother to clean them up and take a risk of
                               1111                 :      * further trouble.  The next mdclose() will soon close them.
                               1112                 :      */
 1100 noah                     1113 GIC       59652 :     while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
 1100 noah                     1114 UIC           0 :         segno++;
                               1115                 : 
 2404 andres                   1116 GBC      119304 :     while (segno > 0)
 6885 tgl                      1117 EUB             :     {
 2404 andres                   1118 GBC       59652 :         MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
                               1119                 : 
                               1120                 :         /*
                               1121                 :          * fsyncs done through mdimmedsync() should be tracked in a separate
                               1122                 :          * IOContext than those done through mdsyncfiletag() to differentiate
                               1123                 :          * between unavoidable client backend fsyncs (e.g. those done during
                               1124                 :          * index build) and those which ideally would have been done by the
                               1125                 :          * checkpointer. Since other IO operations bypassing the buffer
                               1126                 :          * manager could also be tracked in such an IOContext, wait until
                               1127                 :          * these are also tracked to track immediate fsyncs.
                               1128                 :          */
 2213 rhaas                    1129 GIC       59652 :         if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
 1602 tmunro                   1130 UIC           0 :             ereport(data_sync_elevel(ERROR),
                               1131                 :                     (errcode_for_file_access(),
                               1132                 :                      errmsg("could not fsync file \"%s\": %m",
                               1133                 :                             FilePathName(v->mdfd_vfd))));
                               1134                 : 
 1100 noah                     1135 ECB             :         /* Close inactive segments immediately */
 1100 noah                     1136 GIC       59652 :         if (segno > min_inactive_seg)
                               1137                 :         {
 1100 noah                     1138 UIC           0 :             FileClose(v->mdfd_vfd);
                               1139               0 :             _fdvec_resize(reln, forknum, segno - 1);
                               1140                 :         }
                               1141                 : 
 2404 andres                   1142 GIC       59652 :         segno--;
                               1143                 :     }
 6885 tgl                      1144           59652 : }
 6885 tgl                      1145 ECB             : 
 9770 scrappy                  1146                 : /*
                               1147                 :  * register_dirty_segment() -- Mark a relation segment as needing fsync
                               1148                 :  *
 6887 tgl                      1149 EUB             :  * If there is a local pending-ops table, just make an entry in it for
 1466 tmunro                   1150                 :  * ProcessSyncRequests to process later.  Otherwise, try to pass off the
                               1151                 :  * fsync request to the checkpointer process.  If that fails, just do the
                               1152                 :  * fsync locally before returning (we hope this will not happen often
                               1153                 :  * enough to be a performance problem).
                               1154                 :  */
                               1155                 : static void
 5354 heikki.linnakangas       1156 CBC     1058988 : register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 9770 scrappy                  1157 ECB             : {
                               1158                 :     FileTag     tag;
                               1159                 : 
  277 rhaas                    1160 GNC     1058988 :     INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
                               1161                 : 
                               1162                 :     /* Temp relations should never be fsync'd */
 3918 tgl                      1163 CBC     1058988 :     Assert(!SmgrIsTemp(reln));
 3918 tgl                      1164 ECB             : 
 1466 tmunro                   1165 GIC     1058988 :     if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
                               1166                 :     {
                               1167                 :         instr_time  io_start;
                               1168                 : 
    2 andres                   1169             135 :         ereport(DEBUG1,
    2 andres                   1170 ECB             :                 (errmsg_internal("could not forward fsync request because request queue is full")));
                               1171                 : 
    2 andres                   1172 GNC         135 :         io_start = pgstat_prepare_io_time();
                               1173                 : 
    2 andres                   1174 CBC         135 :         if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
    2 andres                   1175 UIC           0 :             ereport(data_sync_elevel(ERROR),
    2 andres                   1176 ECB             :                     (errcode_for_file_access(),
                               1177                 :                      errmsg("could not fsync file \"%s\": %m",
                               1178                 :                             FilePathName(seg->mdfd_vfd))));
                               1179                 : 
                               1180                 :         /*
                               1181                 :          * We have no way of knowing if the current IOContext is
                               1182                 :          * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
                               1183                 :          * point, so count the fsync as being in the IOCONTEXT_NORMAL
                               1184                 :          * IOContext. This is probably okay, because the number of backend
                               1185                 :          * fsyncs doesn't say anything about the efficacy of the
                               1186                 :          * BufferAccessStrategy. And counting both fsyncs done in
                               1187                 :          * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
                               1188                 :          * IOCONTEXT_NORMAL is likely clearer when investigating the number of
                               1189                 :          * backend fsyncs.
                               1190                 :          */
    2 andres                   1191 GNC         135 :         pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                               1192                 :                                 IOOP_FSYNC, io_start, 1);
                               1193                 :     }
 9770 scrappy                  1194 GIC     1058988 : }
                               1195                 : 
 5624 tgl                      1196 EUB             : /*
 1401 akapila                  1197                 :  * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
                               1198                 :  */
                               1199                 : static void
  277 rhaas                    1200 GNC       28471 : register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
                               1201                 :                         BlockNumber segno)
 5624 tgl                      1202 EUB             : {
 1466 tmunro                   1203                 :     FileTag     tag;
                               1204                 : 
  277 rhaas                    1205 GNC       28471 :     INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
 1466 tmunro                   1206 EUB             : 
                               1207                 :     /* Should never be used with temp relations */
  277 rhaas                    1208 GNC       28471 :     Assert(!RelFileLocatorBackendIsTemp(rlocator));
 3918 tgl                      1209 EUB             : 
 1466 tmunro                   1210 GIC       28471 :     RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
 5624 tgl                      1211 CBC       28471 : }
                               1212                 : 
                               1213                 : /*
                               1214                 :  * register_forget_request() -- forget any fsyncs for a relation fork's segment
                               1215                 :  */
                               1216                 : static void
  277 rhaas                    1217 GNC      109825 : register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
                               1218                 :                         BlockNumber segno)
                               1219                 : {
 1466 tmunro                   1220 ECB             :     FileTag     tag;
                               1221                 : 
  277 rhaas                    1222 GNC      109825 :     INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
 5624 bruce                    1223 EUB             : 
 1466 tmunro                   1224 GIC      109825 :     RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
 5926 tgl                      1225          109825 : }
                               1226                 : 
                               1227                 : /*
 1445 fujii                    1228 ECB             :  * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
 5926 tgl                      1229                 :  */
                               1230                 : void
 1466 tmunro                   1231 GIC          29 : ForgetDatabaseSyncRequests(Oid dbid)
                               1232                 : {
                               1233                 :     FileTag     tag;
                               1234                 :     RelFileLocator rlocator;
                               1235                 : 
  277 rhaas                    1236 GNC          29 :     rlocator.dbOid = dbid;
                               1237              29 :     rlocator.spcOid = 0;
                               1238              29 :     rlocator.relNumber = 0;
 5926 tgl                      1239 ECB             : 
  277 rhaas                    1240 GNC          29 :     INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
                               1241                 : 
 1466 tmunro                   1242 GIC          29 :     RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
 8198 vadim4o                  1243              29 : }
                               1244                 : 
                               1245                 : /*
                               1246                 :  * DropRelationFiles -- drop files of all given relations
                               1247                 :  */
                               1248                 : void
  277 rhaas                    1249 GNC        2215 : DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
                               1250                 : {
                               1251                 :     SMgrRelation *srels;
                               1252                 :     int         i;
                               1253                 : 
 1739 fujii                    1254 GIC        2215 :     srels = palloc(sizeof(SMgrRelation) * ndelrels);
 1739 fujii                    1255 CBC        8330 :     for (i = 0; i < ndelrels; i++)
                               1256                 :     {
 1739 fujii                    1257 GIC        6115 :         SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
                               1258                 : 
                               1259            6115 :         if (isRedo)
                               1260                 :         {
                               1261                 :             ForkNumber  fork;
                               1262                 : 
                               1263           30455 :             for (fork = 0; fork <= MAX_FORKNUM; fork++)
 1739 fujii                    1264 CBC       24364 :                 XLogDropRelation(delrels[i], fork);
                               1265                 :         }
                               1266            6115 :         srels[i] = srel;
                               1267                 :     }
                               1268                 : 
 1739 fujii                    1269 GIC        2215 :     smgrdounlinkall(srels, ndelrels, isRedo);
                               1270                 : 
 1474 tomas.vondra             1271            8330 :     for (i = 0; i < ndelrels; i++)
 1739 fujii                    1272            6115 :         smgrclose(srels[i]);
                               1273            2215 :     pfree(srels);
 1739 fujii                    1274 CBC        2215 : }
 1739 fujii                    1275 EUB             : 
                               1276                 : 
 9770 scrappy                  1277 ECB             : /*
                               1278                 :  *  _fdvec_resize() -- Resize the fork's open segments array
                               1279                 :  */
                               1280                 : static void
 2404 andres                   1281 GIC     2239960 : _fdvec_resize(SMgrRelation reln,
                               1282                 :               ForkNumber forknum,
                               1283                 :               int nseg)
                               1284                 : {
                               1285         2239960 :     if (nseg == 0)
                               1286                 :     {
                               1287          973456 :         if (reln->md_num_open_segs[forknum] > 0)
                               1288                 :         {
                               1289          973456 :             pfree(reln->md_seg_fds[forknum]);
 2404 andres                   1290 CBC      973456 :             reln->md_seg_fds[forknum] = NULL;
 2404 andres                   1291 EUB             :         }
                               1292                 :     }
 2404 andres                   1293 GIC     1266504 :     else if (reln->md_num_open_segs[forknum] == 0)
                               1294                 :     {
                               1295         1266504 :         reln->md_seg_fds[forknum] =
                               1296         1266504 :             MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
 2404 andres                   1297 ECB             :     }
                               1298                 :     else
 2404 andres                   1299 EUB             :     {
                               1300                 :         /*
                               1301                 :          * It doesn't seem worthwhile complicating the code to amortize
                               1302                 :          * repalloc() calls.  Those are far faster than PathNameOpenFile() or
 1185 noah                     1303 ECB             :          * FileClose(), and the memory context internally will sometimes avoid
                               1304                 :          * doing an actual reallocation.
 2404 andres                   1305                 :          */
 2404 andres                   1306 UIC           0 :         reln->md_seg_fds[forknum] =
                               1307               0 :             repalloc(reln->md_seg_fds[forknum],
                               1308                 :                      sizeof(MdfdVec) * nseg);
                               1309                 :     }
                               1310                 : 
 2404 andres                   1311 GIC     2239960 :     reln->md_num_open_segs[forknum] = nseg;
 9453 vadim4o                  1312         2239960 : }
                               1313                 : 
                               1314                 : /*
                               1315                 :  * Return the filename for the specified segment of the relation. The
                               1316                 :  * returned string is palloc'd.
 9453 vadim4o                  1317 ECB             :  */
                               1318                 : static char *
 4995 heikki.linnakangas       1319 GIC       59662 : _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
                               1320                 : {
 4790 bruce                    1321 ECB             :     char       *path,
                               1322                 :                *fullpath;
                               1323                 : 
  277 rhaas                    1324 GNC       59662 :     path = relpath(reln->smgr_rlocator, forknum);
                               1325                 : 
 9345 bruce                    1326 CBC       59662 :     if (segno > 0)
                               1327                 :     {
 3380 peter_e                  1328 GIC       59662 :         fullpath = psprintf("%s.%u", path, segno);
 8400 tgl                      1329           59662 :         pfree(path);
 9345 bruce                    1330 ECB             :     }
                               1331                 :     else
 9345 bruce                    1332 UIC           0 :         fullpath = path;
 9345 bruce                    1333 ECB             : 
 4995 heikki.linnakangas       1334 GIC       59662 :     return fullpath;
 4995 heikki.linnakangas       1335 ECB             : }
 4995 heikki.linnakangas       1336 EUB             : 
                               1337                 : /*
                               1338                 :  * Open the specified segment of the relation,
                               1339                 :  * and make a MdfdVec object for it.  Returns NULL on failure.
                               1340                 :  */
                               1341                 : static MdfdVec *
 4995 heikki.linnakangas       1342 GIC       59652 : _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
                               1343                 :               int oflags)
                               1344                 : {
                               1345                 :     MdfdVec    *v;
                               1346                 :     File        fd;
                               1347                 :     char       *fullpath;
                               1348                 : 
                               1349           59652 :     fullpath = _mdfd_segpath(reln, forknum, segno);
                               1350                 : 
                               1351                 :     /* open the file */
    1 tmunro                   1352 GNC       59652 :     fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
                               1353                 : 
 8400 tgl                      1354 GIC       59652 :     pfree(fullpath);
 9345 bruce                    1355 ECB             : 
 9345 bruce                    1356 GIC       59652 :     if (fd < 0)
 7032 neilc                    1357           59652 :         return NULL;
                               1358                 : 
                               1359                 :     /*
                               1360                 :      * Segments are always opened in order from lowest to highest, so we must
 1168 tmunro                   1361 ECB             :      * be adding a new one at the end.
                               1362                 :      */
 1168 tmunro                   1363 UIC           0 :     Assert(segno == reln->md_num_open_segs[forknum]);
                               1364                 : 
                               1365               0 :     _fdvec_resize(reln, forknum, segno + 1);
 9345 bruce                    1366 ECB             : 
                               1367                 :     /* fill the entry */
 2404 andres                   1368 UIC           0 :     v = &reln->md_seg_fds[forknum][segno];
 9345 bruce                    1369 LBC           0 :     v->mdfd_vfd = fd;
 6887 tgl                      1370 UIC           0 :     v->mdfd_segno = segno;
 2404 andres                   1371 ECB             : 
 5354 heikki.linnakangas       1372 LBC           0 :     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
                               1373                 : 
                               1374                 :     /* all done */
 8986 bruce                    1375 UIC           0 :     return v;
                               1376                 : }
                               1377                 : 
 7033 neilc                    1378 ECB             : /*
                               1379                 :  *  _mdfd_getseg() -- Find the segment of the relation holding the
                               1380                 :  *      specified block.
                               1381                 :  *
                               1382                 :  * If the segment doesn't exist, we ereport, return NULL, or create the
 4622 rhaas                    1383                 :  * segment, according to "behavior".  Note: skipFsync is only used in the
                               1384                 :  * EXTENSION_CREATE case.
 7033 neilc                    1385                 :  */
 8620 tgl                      1386                 : static MdfdVec *
 5354 heikki.linnakangas       1387 GIC     2927626 : _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
                               1388                 :              bool skipFsync, int behavior)
                               1389                 : {
                               1390                 :     MdfdVec    *v;
                               1391                 :     BlockNumber targetseg;
 6887 tgl                      1392 ECB             :     BlockNumber nextsegno;
                               1393                 : 
                               1394                 :     /* some way to handle non-existent segments needs to be specified */
 2531 andres                   1395 GIC     2927626 :     Assert(behavior &
                               1396                 :            (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
  337 tmunro                   1397 ECB             :             EXTENSION_DONT_OPEN));
 2531 andres                   1398                 : 
 5940 tgl                      1399 CBC     2927626 :     targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
                               1400                 : 
 2404 andres                   1401 ECB             :     /* if an existing and opened segment, we're done */
 2404 andres                   1402 GIC     2927626 :     if (targetseg < reln->md_num_open_segs[forknum])
 6887 tgl                      1403 ECB             :     {
 2404 andres                   1404 CBC     2678715 :         v = &reln->md_seg_fds[forknum][targetseg];
 2404 andres                   1405 GIC     2678715 :         return v;
                               1406                 :     }
                               1407                 : 
                               1408                 :     /* The caller only wants the segment if we already had it open. */
  337 tmunro                   1409          248911 :     if (behavior & EXTENSION_DONT_OPEN)
  337 tmunro                   1410 CBC         277 :         return NULL;
                               1411                 : 
                               1412                 :     /*
                               1413                 :      * The target segment is not yet open. Iterate over all the segments
                               1414                 :      * between the last opened and the target segment. This way missing
 2404 andres                   1415 ECB             :      * segments either raise an error, or get created (according to
                               1416                 :      * 'behavior'). Start with either the last opened, or the first segment if
                               1417                 :      * none was opened before.
                               1418                 :      */
 2404 andres                   1419 GIC      248634 :     if (reln->md_num_open_segs[forknum] > 0)
 2404 andres                   1420 CBC          10 :         v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
                               1421                 :     else
                               1422                 :     {
 1362 tmunro                   1423 GIC      248624 :         v = mdopenfork(reln, forknum, behavior);
 2404 andres                   1424 CBC      248621 :         if (!v)
 2404 andres                   1425 LBC           0 :             return NULL;        /* if behavior & EXTENSION_RETURN_NULL */
                               1426                 :     }
 2404 andres                   1427 ECB             : 
 2404 andres                   1428 GIC      248631 :     for (nextsegno = reln->md_num_open_segs[forknum];
                               1429          248631 :          nextsegno <= targetseg; nextsegno++)
 2404 andres                   1430 ECB             :     {
 2404 andres                   1431 GIC          10 :         BlockNumber nblocks = _mdnblocks(reln, forknum, v);
 2404 andres                   1432 CBC          10 :         int         flags = 0;
 2539 andres                   1433 ECB             : 
 2404 andres                   1434 CBC          10 :         Assert(nextsegno == v->mdfd_segno + 1);
 2404 andres                   1435 ECB             : 
 2404 andres                   1436 GIC          10 :         if (nblocks > ((BlockNumber) RELSEG_SIZE))
 2404 andres                   1437 UIC           0 :             elog(FATAL, "segment too big");
                               1438                 : 
 2404 andres                   1439 GIC          10 :         if ((behavior & EXTENSION_CREATE) ||
                               1440              10 :             (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
                               1441                 :         {
 2404 andres                   1442 ECB             :             /*
                               1443                 :              * Normally we will create new segments only if authorized by the
                               1444                 :              * caller (i.e., we are doing mdextend()).  But when doing WAL
                               1445                 :              * recovery, create segments anyway; this allows cases such as
                               1446                 :              * replaying WAL data that has a write into a high-numbered
                               1447                 :              * segment of a relation that was later deleted. We want to go
                               1448                 :              * ahead and create the segments so we can finish out the replay.
                               1449                 :              *
                               1450                 :              * We have to maintain the invariant that segments before the last
                               1451                 :              * active segment are of size RELSEG_SIZE; therefore, if
                               1452                 :              * extending, pad them out with zeroes if needed.  (This only
                               1453                 :              * matters if in recovery, or if the caller is extending the
                               1454                 :              * relation discontiguously, but that can happen in hash indexes.)
                               1455                 :              */
 2404 andres                   1456 LBC           0 :             if (nblocks < ((BlockNumber) RELSEG_SIZE))
 5940 tgl                      1457 ECB             :             {
    1 tmunro                   1458 UNC           0 :                 char       *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
                               1459                 :                                                      MCXT_ALLOC_ZERO);
                               1460                 : 
 2404 andres                   1461 UIC           0 :                 mdextend(reln, forknum,
                               1462               0 :                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
                               1463                 :                          zerobuf, skipFsync);
                               1464               0 :                 pfree(zerobuf);
                               1465                 :             }
                               1466               0 :             flags = O_CREAT;
                               1467                 :         }
 2404 andres                   1468 GBC          10 :         else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
 2404 andres                   1469 EUB             :                  nblocks < ((BlockNumber) RELSEG_SIZE))
                               1470                 :         {
                               1471                 :             /*
                               1472                 :              * When not extending (or explicitly including truncated
 2404 andres                   1473 ECB             :              * segments), only open the next segment if the current one is
                               1474                 :              * exactly RELSEG_SIZE.  If not (this branch), either return NULL
                               1475                 :              * or fail.
                               1476                 :              */
 2404 andres                   1477 GIC          10 :             if (behavior & EXTENSION_RETURN_NULL)
                               1478                 :             {
                               1479                 :                 /*
                               1480                 :                  * Some callers discern between reasons for _mdfd_getseg()
 2404 andres                   1481 ECB             :                  * returning NULL based on errno. As there's no failing
                               1482                 :                  * syscall involved in this case, explicitly set errno to
                               1483                 :                  * ENOENT, as that seems the closest interpretation.
                               1484                 :                  */
 2404 andres                   1485 UIC           0 :                 errno = ENOENT;
 2404 andres                   1486 LBC           0 :                 return NULL;
                               1487                 :             }
 2539 andres                   1488 ECB             : 
 2404 andres                   1489 GIC          10 :             ereport(ERROR,
 2404 andres                   1490 ECB             :                     (errcode_for_file_access(),
                               1491                 :                      errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
                               1492                 :                             _mdfd_segpath(reln, forknum, nextsegno),
                               1493                 :                             blkno, nblocks)));
 2404 andres                   1494 EUB             :         }
                               1495                 : 
 2404 andres                   1496 LBC           0 :         v = _mdfd_openseg(reln, forknum, nextsegno, flags);
                               1497                 : 
 2404 andres                   1498 UIC           0 :         if (v == NULL)
                               1499                 :         {
                               1500               0 :             if ((behavior & EXTENSION_RETURN_NULL) &&
                               1501               0 :                 FILE_POSSIBLY_DELETED(errno))
                               1502               0 :                 return NULL;
                               1503               0 :             ereport(ERROR,
 2404 andres                   1504 ECB             :                     (errcode_for_file_access(),
                               1505                 :                      errmsg("could not open file \"%s\" (target block %u): %m",
                               1506                 :                             _mdfd_segpath(reln, forknum, nextsegno),
                               1507                 :                             blkno)));
                               1508                 :         }
                               1509                 :     }
                               1510                 : 
 8986 bruce                    1511 CBC      248621 :     return v;
                               1512                 : }
                               1513                 : 
 8399 tgl                      1514 ECB             : /*
                               1515                 :  * Get number of blocks present in a single disk file
 8400                          1516                 :  */
                               1517                 : static BlockNumber
 5354 heikki.linnakangas       1518 CBC     4252034 : _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 9770 scrappy                  1519 ECB             : {
                               1520                 :     off_t       len;
                               1521                 : 
 1614 tmunro                   1522 GIC     4252034 :     len = FileSize(seg->mdfd_vfd);
 8397 bruce                    1523         4252034 :     if (len < 0)
 5940 tgl                      1524 UIC           0 :         ereport(ERROR,
 5940 tgl                      1525 EUB             :                 (errcode_for_file_access(),
                               1526                 :                  errmsg("could not seek to end of file \"%s\": %m",
 4995 heikki.linnakangas       1527                 :                         FilePathName(seg->mdfd_vfd))));
                               1528                 :     /* note that this calculation will ignore any partial block at EOF */
 5940 tgl                      1529 GIC     4252034 :     return (BlockNumber) (len / BLCKSZ);
 9770 scrappy                  1530 EUB             : }
 1466 tmunro                   1531                 : 
                               1532                 : /*
                               1533                 :  * Sync a file to disk, given a file tag.  Write the path into an output
                               1534                 :  * buffer so the caller can use it in error messages.
                               1535                 :  *
                               1536                 :  * Return 0 on success, -1 on failure, with errno set.
                               1537                 :  */
                               1538                 : int
 1466 tmunro                   1539 UIC           0 : mdsyncfiletag(const FileTag *ftag, char *path)
                               1540                 : {
  277 rhaas                    1541 UNC           0 :     SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
                               1542                 :     File        file;
                               1543                 :     instr_time  io_start;
                               1544                 :     bool        need_to_close;
                               1545                 :     int         result,
                               1546                 :                 save_errno;
                               1547                 : 
                               1548                 :     /* See if we already have the file open, or need to open it. */
 1212 tmunro                   1549 UIC           0 :     if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
 1212 tmunro                   1550 ECB             :     {
 1212 tmunro                   1551 UIC           0 :         file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
                               1552               0 :         strlcpy(path, FilePathName(file), MAXPGPATH);
                               1553               0 :         need_to_close = false;
                               1554                 :     }
                               1555                 :     else
                               1556                 :     {
                               1557                 :         char       *p;
 1212 tmunro                   1558 ECB             : 
 1212 tmunro                   1559 UIC           0 :         p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
                               1560               0 :         strlcpy(path, p, MAXPGPATH);
                               1561               0 :         pfree(p);
 1212 tmunro                   1562 ECB             : 
    1 tmunro                   1563 UNC           0 :         file = PathNameOpenFile(path, _mdfd_open_flags());
 1212 tmunro                   1564 UIC           0 :         if (file < 0)
 1212 tmunro                   1565 LBC           0 :             return -1;
 1212 tmunro                   1566 UIC           0 :         need_to_close = true;
 1212 tmunro                   1567 ECB             :     }
                               1568                 : 
    2 andres                   1569 UNC           0 :     io_start = pgstat_prepare_io_time();
                               1570                 : 
                               1571                 :     /* Sync the file. */
 1212 tmunro                   1572 UIC           0 :     result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
                               1573               0 :     save_errno = errno;
 1212 tmunro                   1574 ECB             : 
 1212 tmunro                   1575 LBC           0 :     if (need_to_close)
 1212 tmunro                   1576 UIC           0 :         FileClose(file);
                               1577                 : 
    2 andres                   1578 UNC           0 :     pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                               1579                 :                             IOOP_FSYNC, io_start, 1);
                               1580                 : 
 1212 tmunro                   1581 UIC           0 :     errno = save_errno;
                               1582               0 :     return result;
                               1583                 : }
                               1584                 : 
                               1585                 : /*
                               1586                 :  * Unlink a file, given a file tag.  Write the path into an output
 1466 tmunro                   1587 ECB             :  * buffer so the caller can use it in error messages.
                               1588                 :  *
                               1589                 :  * Return 0 on success, -1 on failure, with errno set.
                               1590                 :  */
                               1591                 : int
 1466 tmunro                   1592 CBC       27428 : mdunlinkfiletag(const FileTag *ftag, char *path)
 1466 tmunro                   1593 EUB             : {
                               1594                 :     char       *p;
                               1595                 : 
 1466 tmunro                   1596 ECB             :     /* Compute the path. */
  277 rhaas                    1597 GNC       27428 :     p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
 1466 tmunro                   1598 GIC       27428 :     strlcpy(path, p, MAXPGPATH);
 1466 tmunro                   1599 CBC       27428 :     pfree(p);
 1466 tmunro                   1600 ECB             : 
                               1601                 :     /* Try to unlink the file. */
 1466 tmunro                   1602 CBC       27428 :     return unlink(path);
                               1603                 : }
 1466 tmunro                   1604 ECB             : 
 1466 tmunro                   1605 EUB             : /*
                               1606                 :  * Check if a given candidate request matches a given tag, when processing
 1466 tmunro                   1607 ECB             :  * a SYNC_FILTER_REQUEST request.  This will be called for all pending
                               1608                 :  * requests to find out whether to forget them.
                               1609                 :  */
                               1610                 : bool
 1466 tmunro                   1611 GIC        4329 : mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
                               1612                 : {
                               1613                 :     /*
                               1614                 :      * For now we only use filter requests as a way to drop all scheduled
                               1615                 :      * callbacks relating to a given database, when dropping the database.
                               1616                 :      * We'll return true for all candidates that have the same database OID as
                               1617                 :      * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
                               1618                 :      */
  277 rhaas                    1619 GNC        4329 :     return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
                               1620                 : }

Generated by: LCOV version v1.16-55-g56c0a2a