/*-------------------------------------------------------------------------
 *
 * dsm_impl.c
 *    manage dynamic shared memory segments
 *
 * This file provides low-level APIs for creating and destroying shared
 * memory segments using several different possible techniques.  We refer
 * to these segments as dynamic because they can be created, altered, and
 * destroyed at any point during the server life cycle.  This is unlike
 * the main shared memory segment, of which there is always exactly one
 * and which is always mapped at a fixed address in every PostgreSQL
 * background process.
 *
 * Because not all systems provide the same primitives in this area, nor
 * do all primitives behave the same way on all systems, we provide
 * several implementations of this facility.  Many systems implement
 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
 * in this area, with the exception that shared memory identifiers live
 * in a flat system-wide namespace, raising the uncomfortable prospect of
 * name collisions with other processes (including other copies of
 * PostgreSQL) running on the same system.  Some systems only support
 * the older System V shared memory interface (shmget etc.), which is
 * also usable; however, the default allocation limits are often quite
 * small, and the namespace is even more restricted.
 *
 * We also provide an mmap-based shared memory implementation.  This may
 * be useful on systems that provide shared memory via a special-purpose
 * filesystem; by opting for this implementation, the user can even
 * control precisely where their shared memory segments are placed.  It
 * can also be used as a fallback for systems where shm_open and shmget
 * are not available or can't be used for some reason.  Of course,
 * mapping a file residing on an actual spinning disk is a fairly poor
 * approximation for shared memory because writeback may hurt performance
 * substantially, but there should be few systems where we must make do
 * with such poor tools.
 *
 * As ever, Windows requires its own implementation.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/ipc/dsm_impl.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#ifndef WIN32
#include <sys/mman.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/stat.h>
#endif

#include "common/file_perm.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/mem.h"
#include "postmaster/postmaster.h"
#include "storage/dsm_impl.h"
#include "storage/fd.h"
#include "utils/guc.h"
#include "utils/memutils.h"

#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
                           void **impl_private, void **mapped_address,
                           Size *mapped_size, int elevel);
static int dsm_impl_posix_resize(int fd, off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
                          void **impl_private, void **mapped_address,
                          Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
                             void **impl_private, void **mapped_address,
                             Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
                          void **impl_private, void **mapped_address,
                          Size *mapped_size, int elevel);
#endif
static int errcode_for_dynamic_shared_memory(void);

const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
    {"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
    {"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
    {"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
    {"mmap", DSM_IMPL_MMAP, false},
#endif
    {NULL, 0, false}
};
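
/*
 * For example, a postgresql.conf setting of
 *
 *      dynamic_shared_memory_type = posix
 *
 * selects dsm_impl_posix() below; which option strings are available
 * depends on the USE_DSM_* macros defined for the platform.
 */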

/* Implementation selector. */
int dynamic_shared_memory_type = DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE;

/* Amount of space reserved for DSM segments in the main area. */
int min_dynamic_shared_memory;

/* Size of buffer to be used for zero-filling. */
#define ZBUFFER_SIZE            8192

#define SEGMENT_NAME_PREFIX     "Global/PostgreSQL"

/*------
 * Perform a low-level shared memory operation in a platform-specific way,
 * as dictated by the selected implementation.  Each implementation is
 * required to implement the following primitives.
 *
 * DSM_OP_CREATE.  Create a segment whose size is the request_size and
 * map it.
 *
 * DSM_OP_ATTACH.  Map the segment, whose size must be the request_size.
 *
 * DSM_OP_DETACH.  Unmap the segment.
 *
 * DSM_OP_DESTROY.  Unmap the segment, if it is mapped.  Destroy the
 * segment.
 *
 * Arguments:
 *   op: The operation to be performed.
 *   handle: The handle of an existing object, or, for DSM_OP_CREATE, the
 *     new handle the caller wants created.
 *   request_size: For DSM_OP_CREATE, the requested size.  Otherwise, 0.
 *   impl_private: Private, implementation-specific data.  Will be a pointer
 *     to NULL for the first operation on a shared memory segment within this
 *     backend; thereafter, it will point to the value to which it was set
 *     on the previous call.
 *   mapped_address: Pointer to start of current mapping; pointer to NULL
 *     if none.  Updated with new mapping address.
 *   mapped_size: Pointer to size of current mapping; pointer to 0 if none.
 *     Updated with new mapped size.
 *   elevel: Level at which to log errors.
 *
 * Return value: true on success, false on failure.  When false is returned,
 * a message should first be logged at the specified elevel, except in the
 * case where DSM_OP_CREATE experiences a name collision, which should
 * silently return false.
 *-----
 */
bool
dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
            void **impl_private, void **mapped_address, Size *mapped_size,
            int elevel)
{
    Assert(op == DSM_OP_CREATE || request_size == 0);
    Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
           (*mapped_address == NULL && *mapped_size == 0));

    switch (dynamic_shared_memory_type)
    {
#ifdef USE_DSM_POSIX
        case DSM_IMPL_POSIX:
            return dsm_impl_posix(op, handle, request_size, impl_private,
                                  mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_SYSV
        case DSM_IMPL_SYSV:
            return dsm_impl_sysv(op, handle, request_size, impl_private,
                                 mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_WINDOWS
        case DSM_IMPL_WINDOWS:
            return dsm_impl_windows(op, handle, request_size, impl_private,
                                    mapped_address, mapped_size, elevel);
#endif
#ifdef USE_DSM_MMAP
        case DSM_IMPL_MMAP:
            return dsm_impl_mmap(op, handle, request_size, impl_private,
                                 mapped_address, mapped_size, elevel);
#endif
        default:
            elog(ERROR, "unexpected dynamic shared memory type: %d",
                 dynamic_shared_memory_type);
            return false;
    }
}
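
/*
 * A minimal usage sketch (assuming a caller that generates random handles
 * and retries on collision, as the dsm.c layer does; the 64kB size is just
 * an example):
 *
 *      void       *priv = NULL;
 *      void       *addr = NULL;
 *      Size        size = 0;
 *      dsm_handle  h;
 *
 *      do
 *          h = (dsm_handle) random();
 *      while (!dsm_impl_op(DSM_OP_CREATE, h, 65536, &priv, &addr,
 *                          &size, ERROR));
 *
 *      ... use 65536 bytes of shared memory at addr ...
 *
 *      dsm_impl_op(DSM_OP_DETACH, h, 0, &priv, &addr, &size, ERROR);
 *
 * With elevel == ERROR, any real failure is reported via ereport() and does
 * not return, so a false return here indicates only a name collision.
 */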

#ifdef USE_DSM_POSIX
/*
 * Operating system primitives to support POSIX shared memory.
 *
 * POSIX shared memory segments are created and attached using shm_open()
 * and shm_unlink(); other operations, such as sizing or mapping the
 * segment, are performed as if the shared memory segments were files.
 *
 * Indeed, on some platforms, they may be implemented that way.  While
 * POSIX shared memory segments seem intended to exist in a flat namespace,
 * some operating systems may implement them as files, even going so far as
 * to treat a request for /xyz as a request to create a file by that name
 * in the root directory.  Users of such broken platforms should select a
 * different shared memory implementation.
 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
               void **impl_private, void **mapped_address, Size *mapped_size,
               int elevel)
{
    char        name[64];
    int         flags;
    int         fd;
    char       *address;

    snprintf(name, 64, "/PostgreSQL.%u", handle);

    /* Handle teardown cases. */
    if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
    {
        if (*mapped_address != NULL
            && munmap(*mapped_address, *mapped_size) != 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not unmap shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        *mapped_address = NULL;
        *mapped_size = 0;
        if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not remove shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        return true;
    }

    /*
     * Create new segment or open an existing one for attach.
     *
     * Even though we will close the FD before returning, it seems desirable
     * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
     * failure.  The fact that we won't hold the FD open long justifies using
     * ReserveExternalFD rather than AcquireExternalFD, though.
     */
    ReserveExternalFD();

    flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
    if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
    {
        ReleaseExternalFD();
        if (op == DSM_OP_ATTACH || errno != EEXIST)
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not open shared memory segment \"%s\": %m",
                            name)));
        return false;
    }

    /*
     * If we're attaching the segment, determine the current size; if we are
     * creating the segment, set the size to the requested value.
     */
    if (op == DSM_OP_ATTACH)
    {
        struct stat st;

        if (fstat(fd, &st) != 0)
        {
            int         save_errno;

            /* Back out what's already been done. */
            save_errno = errno;
            close(fd);
            ReleaseExternalFD();
            errno = save_errno;

            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not stat shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        request_size = st.st_size;
    }
    else if (dsm_impl_posix_resize(fd, request_size) != 0)
    {
        int         save_errno;

        /* Back out what's already been done. */
        save_errno = errno;
        close(fd);
        ReleaseExternalFD();
        shm_unlink(name);
        errno = save_errno;

        ereport(elevel,
                (errcode_for_dynamic_shared_memory(),
                 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
                        name, request_size)));
        return false;
    }

    /* Map it. */
    address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
    if (address == MAP_FAILED)
    {
        int         save_errno;

        /* Back out what's already been done. */
        save_errno = errno;
        close(fd);
        ReleaseExternalFD();
        if (op == DSM_OP_CREATE)
            shm_unlink(name);
        errno = save_errno;

        ereport(elevel,
                (errcode_for_dynamic_shared_memory(),
                 errmsg("could not map shared memory segment \"%s\": %m",
                        name)));
        return false;
    }
    *mapped_address = address;
    *mapped_size = request_size;
    close(fd);
    ReleaseExternalFD();

    return true;
}
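
/*
 * Note: with the naming scheme above, a segment created with, say, handle
 * 1402373048 appears on Linux (where glibc implements shm_open() on top of
 * a tmpfs mounted at /dev/shm) as /dev/shm/PostgreSQL.1402373048.  Other
 * platforms may not expose POSIX shared memory in the filesystem at all.
 */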

/*
 * Set the size of a virtual memory region associated with a file descriptor.
 * If necessary, also ensure that virtual memory is actually allocated by the
 * operating system, to avoid nasty surprises later.
 *
 * Returns non-zero if either truncation or allocation fails, and sets errno.
 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
    int         rc;
    int         save_errno;
    sigset_t    save_sigmask;

    /*
     * Block all blockable signals, except SIGQUIT.  posix_fallocate() can
     * run for quite a long time, and is an all-or-nothing operation.  If we
     * allowed SIGUSR1 to interrupt us repeatedly (for example, due to
     * recovery conflicts), the retry loop might never succeed.
     */
    if (IsUnderPostmaster)
        sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);

    pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE);
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)

    /*
     * On Linux, a shm_open fd is backed by a tmpfs file.  If we were to use
     * ftruncate, the file would contain a hole.  Accessing memory backed by
     * a hole causes tmpfs to allocate pages, which fails with SIGBUS if
     * there is no more tmpfs space available.  So we ask tmpfs to allocate
     * pages here, so we can fail gracefully with ENOSPC now rather than
     * risking SIGBUS later.
     *
     * We still use a traditional EINTR retry loop to handle SIGCONT.
     * posix_fallocate() doesn't restart automatically, and we don't want
     * this to fail if you attach a debugger.
     */
    do
    {
        rc = posix_fallocate(fd, 0, size);
    } while (rc == EINTR);

    /*
     * The caller expects errno to be set, but posix_fallocate() doesn't set
     * it.  Instead it returns error numbers directly.  So set errno, even
     * though we'll also return rc to indicate success or failure.
     */
    errno = rc;
#else
    /* Extend the file to the requested size. */
    do
    {
        rc = ftruncate(fd, size);
    } while (rc < 0 && errno == EINTR);
#endif
    pgstat_report_wait_end();

    if (IsUnderPostmaster)
    {
        save_errno = errno;
        sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
        errno = save_errno;
    }

    return rc;
}
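
/*
 * To illustrate the failure mode discussed above (a sketch, not code used
 * anywhere in this file): with ftruncate() alone the tmpfs file is sparse,
 * so an out-of-space condition surfaces only on first touch, as SIGBUS:
 *
 *      ftruncate(fd, size);        (file now contains a hole)
 *      p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *      p[size - 1] = 1;            (may SIGBUS if tmpfs is full)
 *
 * posix_fallocate() allocates the pages up front, so the same condition is
 * reported as a clean ENOSPC from dsm_impl_posix_resize() instead.
 */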

#endif                          /* USE_DSM_POSIX */

#ifdef USE_DSM_SYSV
/*
 * Operating system primitives to support System V shared memory.
 *
 * System V shared memory segments are manipulated using shmget(), shmat(),
 * shmdt(), and shmctl().  As the default allocation limits for System V
 * shared memory are usually quite low, the POSIX facilities may be
 * preferable; but those are not supported everywhere.
 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
              void **impl_private, void **mapped_address, Size *mapped_size,
              int elevel)
{
    key_t       key;
    int         ident;
    char       *address;
    char        name[64];
    int        *ident_cache;

    /*
     * POSIX shared memory and mmap-based shared memory identify segments
     * with names.  To avoid needless error message variation, we use the
     * handle as the name.
     */
    snprintf(name, 64, "%u", handle);

    /*
     * The System V shared memory namespace is very restricted; names are of
     * type key_t, which is expected to be some sort of integer data type,
     * but not necessarily the same one as dsm_handle.  Since we use
     * dsm_handle to identify shared memory segments across processes, this
     * might seem like a problem, but it's really not.  If dsm_handle is
     * bigger than key_t, the cast below might truncate away some bits from
     * the user-provided handle, but it'll truncate exactly the same bits
     * away in exactly the same fashion every time we use that handle, which
     * is all that really matters.  Conversely, if dsm_handle is smaller
     * than key_t, we won't use the full range of available key space, but
     * that's no big deal either.
     *
     * We do make sure that the key isn't negative, because that might not
     * be portable.
     */
    key = (key_t) handle;
    if (key < 1)                /* avoid compiler warning if type is unsigned */
        key = -key;
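
    /*
     * For example, on a typical two's-complement system where key_t is a
     * 32-bit int, a handle of 0x80000001 casts to -2147483647 and is then
     * negated to 2147483647.  One bit of the handle is effectively lost,
     * but the mapping is deterministic and identical in every process,
     * which is all the reasoning above relies on.
     */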

    /*
     * There's one special key, IPC_PRIVATE, which can't be used.  If we end
     * up with that value by chance during a create operation, just pretend
     * it already exists, so that caller will retry.  If we run into it
     * anywhere else, the caller has passed a handle that doesn't correspond
     * to anything we ever created, which should not happen.
     */
    if (key == IPC_PRIVATE)
    {
        if (op != DSM_OP_CREATE)
            elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
        errno = EEXIST;
        return false;
    }

    /*
     * Before we can do anything with a shared memory segment, we have to
     * map the shared memory key to a shared memory identifier using
     * shmget().  To avoid repeated lookups, we store the key using
     * impl_private.
     */
    if (*impl_private != NULL)
    {
        ident_cache = *impl_private;
        ident = *ident_cache;
    }
    else
    {
        int         flags = IPCProtection;
        size_t      segsize;

        /*
         * Allocate the memory BEFORE acquiring the resource, so that we
         * don't leak the resource if memory allocation fails.
         */
        ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

        /*
         * When using shmget to find an existing segment, we must pass the
         * size as 0.  Passing a non-zero size which is greater than the
         * actual size will result in EINVAL.
         */
        segsize = 0;

        if (op == DSM_OP_CREATE)
        {
            flags |= IPC_CREAT | IPC_EXCL;
            segsize = request_size;
        }

        if ((ident = shmget(key, segsize, flags)) == -1)
        {
            if (op == DSM_OP_ATTACH || errno != EEXIST)
            {
                int         save_errno = errno;

                pfree(ident_cache);
                errno = save_errno;
                ereport(elevel,
                        (errcode_for_dynamic_shared_memory(),
                         errmsg("could not get shared memory segment: %m")));
            }
            return false;
        }

        *ident_cache = ident;
        *impl_private = ident_cache;
    }

    /* Handle teardown cases. */
    if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
    {
        pfree(ident_cache);
        *impl_private = NULL;
        if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not unmap shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        *mapped_address = NULL;
        *mapped_size = 0;
        if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not remove shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        return true;
    }

    /* If we're attaching it, we must use IPC_STAT to determine the size. */
    if (op == DSM_OP_ATTACH)
    {
        struct shmid_ds shm;

        if (shmctl(ident, IPC_STAT, &shm) != 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not stat shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        request_size = shm.shm_segsz;
    }

    /* Map it. */
    address = shmat(ident, NULL, PG_SHMAT_FLAGS);
    if (address == (void *) -1)
    {
        int         save_errno;

        /* Back out what's already been done. */
        save_errno = errno;
        if (op == DSM_OP_CREATE)
            shmctl(ident, IPC_RMID, NULL);
        errno = save_errno;

        ereport(elevel,
                (errcode_for_dynamic_shared_memory(),
                 errmsg("could not map shared memory segment \"%s\": %m",
                        name)));
        return false;
    }
    *mapped_address = address;
    *mapped_size = request_size;

    return true;
}
#endif

#ifdef USE_DSM_WINDOWS
/*
 * Operating system primitives to support Windows shared memory.
 *
 * The Windows shared memory implementation uses a file mapping, which can
 * be backed by either a physical file or the system paging file.  The
 * current implementation uses the system paging file, because the
 * performance implications of backing it with a physical file are unclear,
 * and because the paging file is used in a similar way for the main shared
 * memory segment on Windows.
 *
 * A memory mapping object is a kernel object; it is deleted when the last
 * reference to it goes away, either explicitly via CloseHandle or when the
 * process holding the reference exits.
 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
                 void **impl_private, void **mapped_address,
                 Size *mapped_size, int elevel)
{
    char       *address;
    HANDLE      hmap;
    char        name[64];
    MEMORY_BASIC_INFORMATION info;

    /*
     * Storing the shared memory segment in the Global\ namespace can allow
     * any process running in any session to access the file mapping object,
     * provided that the caller has the required access rights.  But to
     * avoid the issues faced with the main shared memory segment, we use a
     * naming convention similar to the one used for main shared memory.  We
     * can change this once the issue mentioned in GetSharedMemName is
     * resolved.
     */
    snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

    /*
     * Handle teardown cases.  Since Windows automatically destroys the
     * object when no references remain, we can treat it the same as detach.
     */
    if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
    {
        if (*mapped_address != NULL
            && UnmapViewOfFile(*mapped_address) == 0)
        {
            _dosmaperr(GetLastError());
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not unmap shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        if (*impl_private != NULL
            && CloseHandle(*impl_private) == 0)
        {
            _dosmaperr(GetLastError());
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not remove shared memory segment \"%s\": %m",
                            name)));
            return false;
        }

        *impl_private = NULL;
        *mapped_address = NULL;
        *mapped_size = 0;
        return true;
    }

    /* Create new segment or open an existing one for attach. */
    if (op == DSM_OP_CREATE)
    {
        DWORD       size_high;
        DWORD       size_low;
        DWORD       errcode;

        /* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
        size_high = request_size >> 32;
#else
        size_high = 0;
#endif
        size_low = (DWORD) request_size;

        /* CreateFileMapping might not clear the error code on success */
        SetLastError(0);

        hmap = CreateFileMapping(INVALID_HANDLE_VALUE,  /* Use the pagefile */
                                 NULL,          /* Default security attrs */
                                 PAGE_READWRITE,    /* Memory is read/write */
                                 size_high,     /* Upper 32 bits of size */
                                 size_low,      /* Lower 32 bits of size */
                                 name);

        errcode = GetLastError();
        if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
        {
            /*
             * On Windows, when the segment already exists, a handle for the
             * existing segment is returned.  We must close it before
             * returning.  However, if the existing segment is created by a
             * service, then it returns ERROR_ACCESS_DENIED.  We don't do
             * _dosmaperr here, so errno won't be modified.
             */
            if (hmap)
                CloseHandle(hmap);
            return false;
        }

        if (!hmap)
        {
            _dosmaperr(errcode);
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not create shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
    }
    else
    {
        hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
                               FALSE,   /* do not inherit the name */
                               name);   /* name of mapping object */
        if (!hmap)
        {
            _dosmaperr(GetLastError());
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not open shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
    }

    /* Map it. */
    address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
                            0, 0, 0);
    if (!address)
    {
        int         save_errno;

        _dosmaperr(GetLastError());
        /* Back out what's already been done. */
        save_errno = errno;
        CloseHandle(hmap);
        errno = save_errno;

        ereport(elevel,
                (errcode_for_dynamic_shared_memory(),
                 errmsg("could not map shared memory segment \"%s\": %m",
                        name)));
        return false;
    }

    /*
     * VirtualQuery reports the size in page-size units, which is 4K on
     * Windows.  We need the size only when attaching, but it's better to
     * fetch it when creating a new segment too, so that the size is
     * obtained consistently for both DSM_OP_CREATE and DSM_OP_ATTACH.
     */
    if (VirtualQuery(address, &info, sizeof(info)) == 0)
    {
        int         save_errno;

        _dosmaperr(GetLastError());
        /* Back out what's already been done. */
        save_errno = errno;
        UnmapViewOfFile(address);
        CloseHandle(hmap);
        errno = save_errno;

        ereport(elevel,
                (errcode_for_dynamic_shared_memory(),
                 errmsg("could not stat shared memory segment \"%s\": %m",
                        name)));
        return false;
    }

    *mapped_address = address;
    *mapped_size = info.RegionSize;
    *impl_private = hmap;

    return true;
}
#endif

#ifdef USE_DSM_MMAP
/*
 * Operating system primitives to support mmap-based shared memory.
 *
 * Calling this "shared memory" is somewhat of a misnomer, because what
 * we're really doing is creating a bunch of files and mapping them into
 * our address space.  The operating system may feel obliged to
 * synchronize the contents to disk even if nothing is being paged out,
 * which will not serve us well.  The user can avoid this problem by
 * relocating the pg_dynshmem directory to a ramdisk, if one is available.
 */
static bool
dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
              void **impl_private, void **mapped_address, Size *mapped_size,
              int elevel)
{
    char        name[64];
    int         flags;
    int         fd;
    char       *address;

    snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
             handle);

    /* Handle teardown cases. */
    if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
    {
        if (*mapped_address != NULL
            && munmap(*mapped_address, *mapped_size) != 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not unmap shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        *mapped_address = NULL;
        *mapped_size = 0;
        if (op == DSM_OP_DESTROY && unlink(name) != 0)
        {
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not remove shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        return true;
    }

    /* Create new segment or open an existing one for attach. */
    flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
    if ((fd = OpenTransientFile(name, flags)) == -1)
    {
        if (op == DSM_OP_ATTACH || errno != EEXIST)
            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not open shared memory segment \"%s\": %m",
                            name)));
        return false;
    }

    /*
     * If we're attaching the segment, determine the current size; if we are
     * creating the segment, set the size to the requested value.
     */
    if (op == DSM_OP_ATTACH)
    {
        struct stat st;

        if (fstat(fd, &st) != 0)
        {
            int         save_errno;

            /* Back out what's already been done. */
            save_errno = errno;
            CloseTransientFile(fd);
            errno = save_errno;

            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not stat shared memory segment \"%s\": %m",
                            name)));
            return false;
        }
        request_size = st.st_size;
    }
    else
    {
        /*
         * Allocate a buffer full of zeros.
         *
         * Note: palloc zbuffer, instead of just using a local char array, to
         * ensure it is reasonably well-aligned; this may save a few cycles
         * transferring data to the kernel.
         */
        char       *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
        uint32      remaining = request_size;
        bool        success = true;

        /*
         * Zero-fill the file.  We have to do this the hard way to ensure
         * that all the file space has really been allocated, so that we
         * don't later seg fault when accessing the memory mapping.  This is
         * pretty pessimal.
         */
        while (success && remaining > 0)
        {
            Size        goal = remaining;

            if (goal > ZBUFFER_SIZE)
                goal = ZBUFFER_SIZE;
            pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
            if (write(fd, zbuffer, goal) == goal)
                remaining -= goal;
            else
                success = false;
            pgstat_report_wait_end();
        }

        if (!success)
        {
            int         save_errno;

            /* Back out what's already been done. */
            save_errno = errno;
            CloseTransientFile(fd);
            unlink(name);
            errno = save_errno ? save_errno : ENOSPC;

            ereport(elevel,
                    (errcode_for_dynamic_shared_memory(),
                     errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
                            name, request_size)));
            return false;
        }
    }

    /* Map it. */
    address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
    if (address == MAP_FAILED)
    {
        int         save_errno;

        /* Back out what's already been done. */
        save_errno = errno;
        CloseTransientFile(fd);
        if (op == DSM_OP_CREATE)
            unlink(name);
        errno = save_errno;

        ereport(elevel,
                (errcode_for_dynamic_shared_memory(),
                 errmsg("could not map shared memory segment \"%s\": %m",
                        name)));
        return false;
    }
    *mapped_address = address;
    *mapped_size = request_size;

    if (CloseTransientFile(fd) != 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not close shared memory segment \"%s\": %m",
                        name)));
        return false;
    }

    return true;
}
#endif

/*
 * Implementation-specific actions that must be performed when a segment is
 * to be preserved even when no backend has it attached.
 *
 * Except on Windows, we don't need to do anything at all.  But since
 * Windows cleans up segments automatically when no references remain, we
 * duplicate the segment handle into the postmaster process.  The postmaster
 * needn't do anything to receive the handle; Windows transfers it
 * automatically.
 */
void
dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
                     void **impl_private_pm_handle)
{
    switch (dynamic_shared_memory_type)
    {
#ifdef USE_DSM_WINDOWS
        case DSM_IMPL_WINDOWS:
            if (IsUnderPostmaster)
            {
                HANDLE      hmap;

                if (!DuplicateHandle(GetCurrentProcess(), impl_private,
                                     PostmasterHandle, &hmap, 0, FALSE,
                                     DUPLICATE_SAME_ACCESS))
                {
                    char        name[64];

                    snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
                    _dosmaperr(GetLastError());
                    ereport(ERROR,
                            (errcode_for_dynamic_shared_memory(),
                             errmsg("could not duplicate handle for \"%s\": %m",
                                    name)));
                }

                /*
                 * Here, we remember the handle that we created in the
                 * postmaster process.  This handle isn't actually usable in
                 * any process other than the postmaster, but that doesn't
                 * matter.  We're just holding onto it so that, if the
                 * segment is unpinned, dsm_impl_unpin_segment can close it.
                 */
                *impl_private_pm_handle = hmap;
            }
            break;
#endif
        default:
            break;
    }
}

/*
 * Implementation-specific actions that must be performed when a segment is
 * no longer to be preserved, so that it will be cleaned up when all backends
 * have detached from it.
 *
 * Except on Windows, we don't need to do anything at all.  For Windows, we
 * close the extra handle that dsm_impl_pin_segment created in the
 * postmaster's process space.
 */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
    switch (dynamic_shared_memory_type)
    {
#ifdef USE_DSM_WINDOWS
        case DSM_IMPL_WINDOWS:
            if (IsUnderPostmaster)
            {
                if (*impl_private &&
                    !DuplicateHandle(PostmasterHandle, *impl_private,
                                     NULL, NULL, 0, FALSE,
                                     DUPLICATE_CLOSE_SOURCE))
                {
                    char        name[64];

                    snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
                    _dosmaperr(GetLastError());
                    ereport(ERROR,
                            (errcode_for_dynamic_shared_memory(),
                             errmsg("could not duplicate handle for \"%s\": %m",
                                    name)));
                }

                *impl_private = NULL;
            }
            break;
#endif
        default:
            break;
    }
}

static int
errcode_for_dynamic_shared_memory(void)
{
    if (errno == EFBIG || errno == ENOMEM)
        return errcode(ERRCODE_OUT_OF_MEMORY);
    else
        return errcode_for_file_access();
}