Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reinit.c
4 : * Reinitialization of unlogged relations
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/reinit.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include <unistd.h>
18 :
19 : #include "common/relpath.h"
20 : #include "postmaster/startup.h"
21 : #include "storage/copydir.h"
22 : #include "storage/fd.h"
23 : #include "storage/reinit.h"
24 : #include "utils/hsearch.h"
25 : #include "utils/memutils.h"
26 :
/*
 * Forward declarations of the file-local directory walkers defined further
 * down.  Both receive the same "op" bitmask that was handed to
 * ResetUnloggedRelations(): the first is called once per tablespace
 * directory, and it in turn calls the second once per per-database
 * directory.
 */
27 : static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
28 : int op);
29 : static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
30 : int op);
31 :
/*
 * Entry type of the hash table built by the cleanup pass in
 * ResetUnloggedRelationsInDbspaceDir().  One entry is made for each file
 * name that carries an init fork; the entry holds nothing besides its key,
 * so the table is effectively a set of OIDs.
 */
32 : typedef struct
33 : {
34 : Oid reloid; /* hash key: numeric prefix of the file name (via atooid) */
35 : } unlogged_relation_entry;
36 :
37 : /*
38 : * Reset unlogged relations from before the last restart.
39 : *
40 : * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
41 : * relation with an "init" fork, except for the "init" fork itself.
42 : *
43 : * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
44 : * fork.
45 : */
46 : void
4484 rhaas 47 CBC 268 : ResetUnloggedRelations(int op)
48 : {
49 : char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
50 : DIR *spc_dir;
51 : struct dirent *spc_de;
52 : MemoryContext tmpctx,
53 : oldctx;
54 :
55 : /* Log it. */
4282 peter_e 56 268 : elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
57 : (op & UNLOGGED_RELATION_CLEANUP) != 0,
58 : (op & UNLOGGED_RELATION_INIT) != 0);
59 :
60 : /*
61 : * Just to be sure we don't leak any memory, let's create a temporary
62 : * memory context for this operation.
63 : */
4484 rhaas 64 268 : tmpctx = AllocSetContextCreate(CurrentMemoryContext,
65 : "ResetUnloggedRelations",
66 : ALLOCSET_DEFAULT_SIZES);
67 268 : oldctx = MemoryContextSwitchTo(tmpctx);
68 :
69 : /* Prepare to report progress resetting unlogged relations. */
531 70 268 : begin_startup_progress_phase();
71 :
72 : /*
73 : * First process unlogged files in pg_default ($PGDATA/base)
74 : */
4484 75 268 : ResetUnloggedRelationsInTablespaceDir("base", op);
76 :
77 : /*
78 : * Cycle through directories for all non-default tablespaces.
79 : */
80 268 : spc_dir = AllocateDir("pg_tblspc");
81 :
82 877 : while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
83 : {
84 609 : if (strcmp(spc_de->d_name, ".") == 0 ||
85 341 : strcmp(spc_de->d_name, "..") == 0)
86 536 : continue;
87 :
88 73 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
4382 bruce 89 73 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
4484 rhaas 90 73 : ResetUnloggedRelationsInTablespaceDir(temp_path, op);
91 : }
92 :
93 268 : FreeDir(spc_dir);
94 :
95 : /*
96 : * Restore memory context.
97 : */
98 268 : MemoryContextSwitchTo(oldctx);
99 268 : MemoryContextDelete(tmpctx);
100 268 : }
101 :
102 : /*
103 : * Process one tablespace directory for ResetUnloggedRelations
104 : */
105 : static void
106 341 : ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
107 : {
108 : DIR *ts_dir;
109 : struct dirent *de;
110 : char dbspace_path[MAXPGPATH * 2];
111 :
112 341 : ts_dir = AllocateDir(tsdirname);
113 :
114 : /*
115 : * If we get ENOENT on a tablespace directory, log it and return. This
116 : * can happen if a previous DROP TABLESPACE crashed between removing the
117 : * tablespace directory and removing the symlink in pg_tblspc. We don't
118 : * really want to prevent database startup in that scenario, so let it
119 : * pass instead. Any other type of error will be reported by ReadDir
120 : * (causing a startup failure).
121 : */
1952 tgl 122 341 : if (ts_dir == NULL && errno == ENOENT)
123 : {
1952 tgl 124 UBC 0 : ereport(LOG,
125 : (errcode_for_file_access(),
126 : errmsg("could not open directory \"%s\": %m",
127 : tsdirname)));
4484 rhaas 128 0 : return;
129 : }
130 :
4484 rhaas 131 CBC 1981 : while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
132 : {
133 : /*
134 : * We're only interested in the per-database directories, which have
135 : * numeric names. Note that this code will also (properly) ignore "."
136 : * and "..".
137 : */
1952 tgl 138 1640 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
4484 rhaas 139 730 : continue;
140 :
141 910 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
142 910 : tsdirname, de->d_name);
143 :
531 144 910 : if (op & UNLOGGED_RELATION_INIT)
145 402 : ereport_startup_progress("resetting unlogged relations (init), elapsed time: %ld.%02d s, current path: %s",
146 : dbspace_path);
147 508 : else if (op & UNLOGGED_RELATION_CLEANUP)
148 508 : ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s",
149 : dbspace_path);
150 :
4484 151 910 : ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
152 : }
153 :
154 341 : FreeDir(ts_dir);
155 : }
156 :
157 : /*
158 : * Process one per-dbspace directory for ResetUnloggedRelations
159 : */
160 : static void
161 910 : ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
162 : {
163 : DIR *dbspace_dir;
164 : struct dirent *de;
165 : char rm_path[MAXPGPATH * 2];
166 :
167 : /* Caller must specify at least one operation. */
168 910 : Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
169 :
170 : /*
171 : * Cleanup is a two-pass operation. First, we go through and identify all
172 : * the files with init forks. Then, we go through again and nuke
173 : * everything with the same OID except the init fork.
174 : */
175 910 : if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
176 : {
177 : HTAB *hash;
178 : HASHCTL ctl;
179 :
180 : /*
181 : * It's possible that someone could create a ton of unlogged relations
182 : * in the same database & tablespace, so we'd better use a hash table
183 : * rather than an array or linked list to keep track of which files
184 : * need to be reset. Otherwise, this cleanup operation would be
185 : * O(n^2).
186 : */
193 187 508 : ctl.keysize = sizeof(Oid);
4484 188 508 : ctl.entrysize = sizeof(unlogged_relation_entry);
845 tgl 189 508 : ctl.hcxt = CurrentMemoryContext;
193 rhaas 190 508 : hash = hash_create("unlogged relation OIDs", 32, &ctl,
191 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
192 :
193 : /* Scan the directory. */
1952 tgl 194 508 : dbspace_dir = AllocateDir(dbspacedirname);
4484 rhaas 195 143499 : while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
196 : {
197 : ForkNumber forkNum;
198 : int relnumchars;
199 : unlogged_relation_entry ent;
200 :
201 : /* Skip anything that doesn't look like a relation data file. */
255 rhaas 202 GNC 142991 : if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
203 : &forkNum))
4484 rhaas 204 CBC 142984 : continue;
205 :
206 : /* Also skip it unless this is the init fork. */
207 141031 : if (forkNum != INIT_FORKNUM)
208 141024 : continue;
209 :
210 : /*
211 : * Put the OID portion of the name into the hash table, if it
212 : * isn't already.
213 : */
193 214 7 : ent.reloid = atooid(de->d_name);
845 tgl 215 7 : (void) hash_search(hash, &ent, HASH_ENTER, NULL);
216 : }
217 :
218 : /* Done with the first pass. */
4484 rhaas 219 508 : FreeDir(dbspace_dir);
220 :
221 : /*
222 : * If we didn't find any init forks, there's no point in continuing;
223 : * we can bail out now.
224 : */
225 508 : if (hash_get_num_entries(hash) == 0)
226 : {
227 505 : hash_destroy(hash);
228 505 : return;
229 : }
230 :
231 : /*
232 : * Now, make a second pass and remove anything that matches.
233 : */
234 3 : dbspace_dir = AllocateDir(dbspacedirname);
235 622 : while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
236 : {
237 : ForkNumber forkNum;
238 : int relnumchars;
239 : unlogged_relation_entry ent;
240 :
241 : /* Skip anything that doesn't look like a relation data file. */
255 rhaas 242 GNC 619 : if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
243 : &forkNum))
4484 rhaas 244 CBC 17 : continue;
245 :
246 : /* We never remove the init fork. */
247 609 : if (forkNum == INIT_FORKNUM)
248 7 : continue;
249 :
250 : /*
251 : * See whether the OID portion of the name shows up in the hash
252 : * table. If so, nuke it!
253 : */
193 254 602 : ent.reloid = atooid(de->d_name);
845 tgl 255 602 : if (hash_search(hash, &ent, HASH_FIND, NULL))
256 : {
4484 rhaas 257 7 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
4382 bruce 258 7 : dbspacedirname, de->d_name);
1952 tgl 259 7 : if (unlink(rm_path) < 0)
1952 tgl 260 UBC 0 : ereport(ERROR,
261 : (errcode_for_file_access(),
262 : errmsg("could not remove file \"%s\": %m",
263 : rm_path)));
264 : else
4484 rhaas 265 CBC 7 : elog(DEBUG2, "unlinked file \"%s\"", rm_path);
266 : }
267 : }
268 :
269 : /* Cleanup is complete. */
270 3 : FreeDir(dbspace_dir);
271 3 : hash_destroy(hash);
272 : }
273 :
274 : /*
275 : * Initialization happens after cleanup is complete: we copy each init
276 : * fork file to the corresponding main fork file. Note that if we are
277 : * asked to do both cleanup and init, we may never get here: if the
278 : * cleanup code determines that there are no init forks in this dbspace,
279 : * it will return before we get to this point.
280 : */
281 405 : if ((op & UNLOGGED_RELATION_INIT) != 0)
282 : {
283 : /* Scan the directory. */
1952 tgl 284 402 : dbspace_dir = AllocateDir(dbspacedirname);
4484 rhaas 285 111581 : while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
286 : {
287 : ForkNumber forkNum;
288 : int relnumchars;
289 : char relnumbuf[OIDCHARS + 1];
290 : char srcpath[MAXPGPATH * 2];
291 : char dstpath[MAXPGPATH];
292 :
293 : /* Skip anything that doesn't look like a relation data file. */
255 rhaas 294 GNC 111179 : if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
295 : &forkNum))
4484 rhaas 296 CBC 111172 : continue;
297 :
298 : /* Also skip it unless this is the init fork. */
299 109625 : if (forkNum != INIT_FORKNUM)
300 109618 : continue;
301 :
302 : /* Construct source pathname. */
303 7 : snprintf(srcpath, sizeof(srcpath), "%s/%s",
304 7 : dbspacedirname, de->d_name);
305 :
306 : /* Construct destination pathname. */
255 rhaas 307 GNC 7 : memcpy(relnumbuf, de->d_name, relnumchars);
308 7 : relnumbuf[relnumchars] = '\0';
4484 rhaas 309 CBC 7 : snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
255 rhaas 310 GNC 7 : dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 +
4484 rhaas 311 CBC 7 : strlen(forkNames[INIT_FORKNUM]));
312 :
313 : /* OK, we're ready to perform the actual copy. */
314 7 : elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
315 7 : copy_file(srcpath, dstpath);
316 : }
317 :
318 402 : FreeDir(dbspace_dir);
319 :
320 : /*
321 : * copy_file() above has already called pg_flush_data() on the files
322 : * it created. Now we need to fsync those files, because a checkpoint
323 : * won't do it for us while we're in recovery. We do this in a
324 : * separate pass to allow the kernel to perform all the flushes
325 : * (especially the metadata ones) at once.
326 : */
3068 andres 327 402 : dbspace_dir = AllocateDir(dbspacedirname);
328 111588 : while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
329 : {
330 : ForkNumber forkNum;
331 : int relnumchars;
332 : char relnumbuf[OIDCHARS + 1];
333 : char mainpath[MAXPGPATH];
334 :
335 : /* Skip anything that doesn't look like a relation data file. */
255 rhaas 336 GNC 111186 : if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
337 : &forkNum))
3068 andres 338 CBC 111179 : continue;
339 :
340 : /* Also skip it unless this is the init fork. */
341 109632 : if (forkNum != INIT_FORKNUM)
342 109625 : continue;
343 :
344 : /* Construct main fork pathname. */
255 rhaas 345 GNC 7 : memcpy(relnumbuf, de->d_name, relnumchars);
346 7 : relnumbuf[relnumchars] = '\0';
3068 andres 347 CBC 7 : snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
255 rhaas 348 GNC 7 : dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 +
3068 andres 349 CBC 7 : strlen(forkNames[INIT_FORKNUM]));
350 :
351 7 : fsync_fname(mainpath, false);
352 : }
353 :
354 402 : FreeDir(dbspace_dir);
355 :
356 : /*
357 : * Lastly, fsync the database directory itself, ensuring the
358 : * filesystem remembers the file creations and deletions we've done.
359 : * We don't bother with this during a call that does only
360 : * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
361 : * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
362 : * too at the next startup attempt.
363 : */
2587 364 402 : fsync_fname(dbspacedirname, true);
365 : }
366 : }
367 :
368 : /*
369 : * Basic parsing of putative relation filenames.
370 : *
371 : * This function returns true if the file appears to be in the correct format
372 : * for a non-temporary relation and false otherwise.
373 : *
374 : * NB: If this function returns true, the caller is entitled to assume that
375 : * *relnumchars has been set to a value no more than OIDCHARS, and thus
376 : * that a buffer of OIDCHARS+1 characters is sufficient to hold the
377 : * RelFileNumber portion of the filename. This is critical to protect against
378 : * a possible buffer overrun.
379 : */
380 : bool
255 rhaas 381 GNC 591675 : parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
382 : ForkNumber *fork)
383 : {
384 : int pos;
385 :
386 : /* Look for a non-empty string of digits (that isn't too long). */
4484 rhaas 387 CBC 2958120 : for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
388 : ;
193 389 591675 : if (pos == 0 || pos > OIDCHARS)
4484 390 6630 : return false;
255 rhaas 391 GNC 585045 : *relnumchars = pos;
392 :
393 : /* Check for a fork name. */
4484 rhaas 394 CBC 585045 : if (name[pos] != '_')
395 439278 : *fork = MAIN_FORKNUM;
396 : else
397 : {
398 : int forkchar;
399 :
4382 bruce 400 145767 : forkchar = forkname_chars(&name[pos + 1], fork);
4484 rhaas 401 145767 : if (forkchar <= 0)
4484 rhaas 402 UBC 0 : return false;
4484 rhaas 403 CBC 145767 : pos += forkchar + 1;
404 : }
405 :
406 : /* Check for a segment number. */
407 585045 : if (name[pos] == '.')
408 : {
409 : int segchar;
410 :
4382 bruce 411 UBC 0 : for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
412 : ;
4484 rhaas 413 0 : if (segchar <= 1)
414 0 : return false;
415 0 : pos += segchar;
416 : }
417 :
418 : /* Now we should be at the end. */
4484 rhaas 419 CBC 585045 : if (name[pos] != '\0')
4484 rhaas 420 UBC 0 : return false;
4484 rhaas 421 CBC 585045 : return true;
422 : }
|