Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * partdesc.c
4 : : * Support routines for manipulating partition descriptors
5 : : *
6 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : * IDENTIFICATION
10 : : * src/backend/partitioning/partdesc.c
11 : : *
12 : : *-------------------------------------------------------------------------
13 : : */
14 : :
15 : : #include "postgres.h"
16 : :
17 : : #include "access/genam.h"
18 : : #include "access/htup_details.h"
19 : : #include "access/table.h"
20 : : #include "catalog/partition.h"
21 : : #include "catalog/pg_inherits.h"
22 : : #include "partitioning/partbounds.h"
23 : : #include "partitioning/partdesc.h"
24 : : #include "utils/builtins.h"
25 : : #include "utils/fmgroids.h"
26 : : #include "utils/hsearch.h"
27 : : #include "utils/lsyscache.h"
28 : : #include "utils/memutils.h"
29 : : #include "utils/partcache.h"
30 : : #include "utils/rel.h"
31 : : #include "utils/snapmgr.h"
32 : : #include "utils/syscache.h"
33 : :
34 : : typedef struct PartitionDirectoryData /* state allocated by CreatePartitionDirectory for one directory */
35 : : {
36 : : MemoryContext pdir_mcxt; /* context passed to CreatePartitionDirectory; also used as the hash table's context */
37 : : HTAB *pdir_hash; /* hash table keyed by relation OID, holding PartitionDirectoryEntry items */
38 : : bool omit_detached; /* forwarded to RelationGetPartitionDesc on every first-time lookup */
39 : : } PartitionDirectoryData;
40 : :
41 : : typedef struct PartitionDirectoryEntry /* one pdir_hash entry per relation that has been looked up */
42 : : {
43 : : Oid reloid; /* hash key; NOTE(review): presumably must stay the first field, since hash_search() is handed only &relid -- confirm against dynahash */
44 : : Relation rel; /* relation whose reference count was incremented at first lookup; decremented in DestroyPartitionDirectory */
45 : : PartitionDesc pd; /* descriptor obtained at first lookup and returned for every later lookup */
46 : : } PartitionDirectoryEntry;
47 : :
48 : : static PartitionDesc RelationBuildPartitionDesc(Relation rel,
49 : : bool omit_detached); /* forward declaration: defined below its only caller in this file, RelationGetPartitionDesc */
50 : :
51 : :
52 : : /*
53 : : * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
54 : : *
55 : : * We keep two partdescs in relcache: rd_partdesc includes all partitions
56 : : * (even those being concurrently marked detached), while rd_partdesc_nodetached
57 : : * omits (some of) those. We store the pg_inherits.xmin value for the latter,
58 : : * to determine whether it can be validly reused in each case, since that
59 : : * depends on the active snapshot.
60 : : *
61 : : * Note: we arrange for partition descriptors to not get freed until the
62 : : * relcache entry's refcount goes to zero (see hacks in RelationClose,
63 : : * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
64 : : * though we hand back a direct pointer into the relcache entry, it's safe
65 : : * for callers to continue to use that pointer as long as (a) they hold the
66 : : * relation open, and (b) they hold a relation lock strong enough to ensure
67 : : * that the data doesn't become stale.
68 : : */
69 : : PartitionDesc
1088 alvherre@alvh.no-ip. 70 :CBC 31724 : RelationGetPartitionDesc(Relation rel, bool omit_detached)
71 : : {
72 [ - + ]: 31724 : Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
73 : :
74 : : /*
75 : : * If relcache has a partition descriptor, use that. However, we can only
76 : : * do so when we are asked to include all partitions including detached;
77 : : * and also when we know that there are no detached partitions.
78 : : *
79 : : * If there is no active snapshot, detached partitions aren't omitted
80 : : * either, so we can use the cached descriptor too in that case.
81 : : */
82 [ + + + + : 31724 : if (likely(rel->rd_partdesc &&
+ + - + +
+ ]
83 : : (!rel->rd_partdesc->detached_exist || !omit_detached ||
84 : : !ActiveSnapshotSet())))
85 : 20589 : return rel->rd_partdesc;
86 : :
87 : : /*
88 : : * If we're asked to omit detached partitions, we may be able to use a
89 : : * cached descriptor too. We determine that based on the pg_inherits.xmin
90 : : * that was saved alongside that descriptor: if the xmin that was not in
91 : : * progress for that active snapshot is also not in progress for the
92 : : * current active snapshot, then we can use it. Otherwise build one from
93 : : * scratch.
94 : : */
1082 95 [ + + ]: 11135 : if (omit_detached &&
96 [ + + + - ]: 10846 : rel->rd_partdesc_nodetached &&
97 : 7 : ActiveSnapshotSet())
98 : : {
99 : : Snapshot activesnap;
100 : :
1074 101 [ - + ]: 7 : Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
1082 102 : 7 : activesnap = GetActiveSnapshot();
103 : :
104 [ + - ]: 7 : if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
105 : 7 : return rel->rd_partdesc_nodetached;
106 : : }
107 : :
1088 108 : 11128 : return RelationBuildPartitionDesc(rel, omit_detached);
109 : : }
110 : :
111 : : /*
112 : : * RelationBuildPartitionDesc
113 : : * Form rel's partition descriptor, and store in relcache entry
114 : : *
115 : : * Partition descriptor is a complex structure; to avoid complicated logic to
116 : : * free individual elements whenever the relcache entry is flushed, we give it
117 : : * its own memory context, a child of CacheMemoryContext, which can easily be
118 : : * deleted on its own. To avoid leaking memory in that context in case of an
119 : : * error partway through this function, the context is initially created as a
120 : : * child of CurTransactionContext and only re-parented to CacheMemoryContext
121 : : * at the end, when no further errors are possible. Also, we don't make this
122 : : * context the current context except in very brief code sections, out of fear
123 : : * that some of our callees allocate memory on their own which would be leaked
124 : : * permanently.
125 : : *
126 : : * As a special case, partition descriptors that are requested to omit
127 : : * partitions being detached (and which contain such partitions) are transient
128 : : * and are not associated with the relcache entry. Such descriptors only last
129 : : * through the requesting Portal, so we use the corresponding memory context
130 : : * for them.
131 : : */
132 : : static PartitionDesc
133 : 11128 : RelationBuildPartitionDesc(Relation rel, bool omit_detached)
134 : : {
135 : : PartitionDesc partdesc;
1879 rhaas@postgresql.org 136 : 11128 : PartitionBoundInfo boundinfo = NULL;
137 : : List *inhoids;
138 : 11128 : PartitionBoundSpec **boundspecs = NULL;
139 : 11128 : Oid *oids = NULL;
1572 tgl@sss.pgh.pa.us 140 : 11128 : bool *is_leaf = NULL;
141 : : bool detached_exist;
142 : : bool is_omit;
143 : : TransactionId detached_xmin;
144 : : ListCell *cell;
145 : : int i,
146 : : nparts;
1879 rhaas@postgresql.org 147 : 11128 : PartitionKey key = RelationGetPartitionKey(rel);
148 : : MemoryContext new_pdcxt;
149 : : MemoryContext oldcxt;
150 : : int *mapping;
151 : :
152 : : /*
153 : : * Get partition oids from pg_inherits. This uses a single snapshot to
154 : : * fetch the list of children, so while more children may be getting added
155 : : * concurrently, whatever this function returns will be accurate as of
156 : : * some well-defined point in time.
157 : : */
1088 alvherre@alvh.no-ip. 158 : 11128 : detached_exist = false;
1082 159 : 11128 : detached_xmin = InvalidTransactionId;
160 : 11128 : inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
161 : : omit_detached, NoLock,
162 : : &detached_exist,
163 : : &detached_xmin);
164 : :
1879 rhaas@postgresql.org 165 : 11128 : nparts = list_length(inhoids);
166 : :
167 : : /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
168 [ + + ]: 11128 : if (nparts > 0)
169 : : {
1572 tgl@sss.pgh.pa.us 170 : 8264 : oids = (Oid *) palloc(nparts * sizeof(Oid));
171 : 8264 : is_leaf = (bool *) palloc(nparts * sizeof(bool));
1879 rhaas@postgresql.org 172 : 8264 : boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
173 : : }
174 : :
175 : : /* Collect bound spec nodes for each partition. */
176 : 11128 : i = 0;
177 [ + + + + : 28321 : foreach(cell, inhoids)
+ + ]
178 : : {
179 : 17193 : Oid inhrelid = lfirst_oid(cell);
180 : : HeapTuple tuple;
1865 181 : 17193 : PartitionBoundSpec *boundspec = NULL;
182 : :
183 : : /* Try fetching the tuple from the catcache, for speed. */
269 michael@paquier.xyz 184 :GNC 17193 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
1865 rhaas@postgresql.org 185 [ + - ]:CBC 17193 : if (HeapTupleIsValid(tuple))
186 : : {
187 : : Datum datum;
188 : : bool isnull;
189 : :
190 : 17193 : datum = SysCacheGetAttr(RELOID, tuple,
191 : : Anum_pg_class_relpartbound,
192 : : &isnull);
193 [ + - ]: 17193 : if (!isnull)
194 : 17193 : boundspec = stringToNode(TextDatumGetCString(datum));
195 : 17193 : ReleaseSysCache(tuple);
196 : : }
197 : :
198 : : /*
199 : : * The system cache may be out of date; if so, we may find no pg_class
200 : : * tuple or an old one where relpartbound is NULL. In that case, try
201 : : * the table directly. We can't just AcceptInvalidationMessages() and
202 : : * retry the system cache lookup because it's possible that a
203 : : * concurrent ATTACH PARTITION operation has removed itself from the
204 : : * ProcArray but not yet added invalidation messages to the shared
205 : : * queue; InvalidateSystemCaches() would work, but seems excessive.
206 : : *
207 : : * Note that this algorithm assumes that PartitionBoundSpec we manage
208 : : * to fetch is the right one -- so this is only good enough for
209 : : * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
210 : : * some hypothetical operation that changes the partition bounds.
211 : : */
212 [ - + ]: 17193 : if (boundspec == NULL)
213 : : {
214 : : Relation pg_class;
215 : : SysScanDesc scan;
216 : : ScanKeyData key[1];
217 : : Datum datum;
218 : : bool isnull;
219 : :
1865 rhaas@postgresql.org 220 :UBC 0 : pg_class = table_open(RelationRelationId, AccessShareLock);
221 : 0 : ScanKeyInit(&key[0],
222 : : Anum_pg_class_oid,
223 : : BTEqualStrategyNumber, F_OIDEQ,
224 : : ObjectIdGetDatum(inhrelid));
225 : 0 : scan = systable_beginscan(pg_class, ClassOidIndexId, true,
226 : : NULL, 1, key);
227 : 0 : tuple = systable_getnext(scan);
228 : 0 : datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
229 : : RelationGetDescr(pg_class), &isnull);
230 [ # # ]: 0 : if (!isnull)
231 : 0 : boundspec = stringToNode(TextDatumGetCString(datum));
232 : 0 : systable_endscan(scan);
233 : 0 : table_close(pg_class, AccessShareLock);
234 : : }
235 : :
236 : : /* Sanity checks. */
1865 rhaas@postgresql.org 237 [ - + ]:CBC 17193 : if (!boundspec)
1865 rhaas@postgresql.org 238 [ # # ]:UBC 0 : elog(ERROR, "missing relpartbound for relation %u", inhrelid);
1879 rhaas@postgresql.org 239 [ - + ]:CBC 17193 : if (!IsA(boundspec, PartitionBoundSpec))
1879 rhaas@postgresql.org 240 [ # # ]:UBC 0 : elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
241 : :
242 : : /*
243 : : * If the PartitionBoundSpec says this is the default partition, its
244 : : * OID should match pg_partitioned_table.partdefid; if not, the
245 : : * catalog is corrupt.
246 : : */
1879 rhaas@postgresql.org 247 [ + + ]:CBC 17193 : if (boundspec->is_default)
248 : : {
249 : : Oid partdefid;
250 : :
251 : 1097 : partdefid = get_default_partition_oid(RelationGetRelid(rel));
252 [ - + ]: 1097 : if (partdefid != inhrelid)
1879 rhaas@postgresql.org 253 [ # # ]:UBC 0 : elog(ERROR, "expected partdefid %u, but got %u",
254 : : inhrelid, partdefid);
255 : : }
256 : :
257 : : /* Save results. */
1879 rhaas@postgresql.org 258 :CBC 17193 : oids[i] = inhrelid;
1572 tgl@sss.pgh.pa.us 259 : 17193 : is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
1879 rhaas@postgresql.org 260 : 17193 : boundspecs[i] = boundspec;
261 : 17193 : ++i;
262 : : }
263 : :
264 : : /*
265 : : * Create PartitionBoundInfo and mapping, working in the caller's context.
266 : : * This could fail, but we haven't done any damage if so.
267 : : */
1572 tgl@sss.pgh.pa.us 268 [ + + ]: 11128 : if (nparts > 0)
269 : 8264 : boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
270 : :
271 : : /*
272 : : * Now build the actual relcache partition descriptor, copying all the
273 : : * data into a new, small context. As per above comment, we don't make
274 : : * this a long-lived context until it's finished.
275 : : */
276 : 11128 : new_pdcxt = AllocSetContextCreate(CurTransactionContext,
277 : : "partition descriptor",
278 : : ALLOCSET_SMALL_SIZES);
279 : 11128 : MemoryContextCopyAndSetIdentifier(new_pdcxt,
280 : : RelationGetRelationName(rel));
281 : :
282 : : partdesc = (PartitionDescData *)
283 : 11128 : MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
1879 rhaas@postgresql.org 284 : 11128 : partdesc->nparts = nparts;
1088 alvherre@alvh.no-ip. 285 : 11128 : partdesc->detached_exist = detached_exist;
286 : : /* If there are no partitions, the rest of the partdesc can stay zero */
1858 tgl@sss.pgh.pa.us 287 [ + + ]: 11128 : if (nparts > 0)
288 : : {
1572 289 : 8264 : oldcxt = MemoryContextSwitchTo(new_pdcxt);
1858 290 : 8264 : partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
291 : :
292 : : /* Initialize caching fields for speeding up ExecFindPartition */
621 drowley@postgresql.o 293 : 8264 : partdesc->last_found_datum_index = -1;
294 : 8264 : partdesc->last_found_part_index = -1;
295 : 8264 : partdesc->last_found_count = 0;
296 : :
1858 tgl@sss.pgh.pa.us 297 : 8264 : partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
298 : 8264 : partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
299 : :
300 : : /*
301 : : * Assign OIDs from the original array into mapped indexes of the
302 : : * result array. The order of OIDs in the former is defined by the
303 : : * catalog scan that retrieved them, whereas that in the latter is
304 : : * defined by canonicalized representation of the partition bounds.
305 : : * Also save leaf-ness of each partition.
306 : : */
307 [ + + ]: 25457 : for (i = 0; i < nparts; i++)
308 : : {
309 : 17193 : int index = mapping[i];
310 : :
311 : 17193 : partdesc->oids[index] = oids[i];
1572 312 : 17193 : partdesc->is_leaf[index] = is_leaf[i];
313 : : }
314 : 8264 : MemoryContextSwitchTo(oldcxt);
315 : : }
316 : :
317 : : /*
318 : : * Are we working with the partdesc that omits the detached partition, or
319 : : * the one that includes it?
320 : : *
321 : : * Note that if a partition was found by the catalog's scan to have been
322 : : * detached, but the pg_inherit tuple saying so was not visible to the
323 : : * active snapshot (find_inheritance_children_extended will not have set
324 : : * detached_xmin in that case), we consider there to be no "omittable"
325 : : * detached partitions.
326 : : */
1074 alvherre@alvh.no-ip. 327 [ + + + + : 11173 : is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
+ - ]
328 [ + + ]: 45 : TransactionIdIsValid(detached_xmin);
329 : :
330 : : /*
331 : : * We have a fully valid partdesc. Reparent it so that it has the right
332 : : * lifespan.
333 : : */
1082 334 : 11128 : MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
335 : :
336 : : /*
337 : : * Store it into relcache.
338 : : *
339 : : * But first, a kluge: if there's an old context for this type of
340 : : * descriptor, it contains an old partition descriptor that may still be
341 : : * referenced somewhere. Preserve it, while not leaking it, by
342 : : * reattaching it as a child context of the new one. Eventually it will
343 : : * get dropped by either RelationClose or RelationClearRelation. (We keep
344 : : * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
345 : : * detached-partitions in rd_pddcxt.)
346 : : */
347 [ + + ]: 11128 : if (is_omit)
348 : : {
349 [ - + ]: 33 : if (rel->rd_pddcxt != NULL)
1082 alvherre@alvh.no-ip. 350 :UBC 0 : MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
1082 alvherre@alvh.no-ip. 351 :CBC 33 : rel->rd_pddcxt = new_pdcxt;
352 : 33 : rel->rd_partdesc_nodetached = partdesc;
353 : :
354 : : /*
355 : : * For partdescs built excluding detached partitions, which we save
356 : : * separately, we also record the pg_inherits.xmin of the detached
357 : : * partition that was omitted; this informs a future potential user of
358 : : * such a cached partdesc to only use it after cross-checking that the
359 : : * xmin is indeed visible to the snapshot it is going to be working
360 : : * with.
361 : : */
1074 362 [ - + ]: 33 : Assert(TransactionIdIsValid(detached_xmin));
1082 363 : 33 : rel->rd_partdesc_nodetached_xmin = detached_xmin;
364 : : }
365 : : else
366 : : {
1088 367 [ + + ]: 11095 : if (rel->rd_pdcxt != NULL)
368 : 2597 : MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
369 : 11095 : rel->rd_pdcxt = new_pdcxt;
370 : 11095 : rel->rd_partdesc = partdesc;
371 : : }
372 : :
373 : 11128 : return partdesc;
374 : : }
375 : :
376 : : /*
377 : : * CreatePartitionDirectory
378 : : * Create a new partition directory object.
379 : : */
380 : : PartitionDirectory
381 : 9940 : CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
382 : : {
1865 rhaas@postgresql.org 383 : 9940 : MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
384 : : PartitionDirectory pdir;
385 : : HASHCTL ctl;
386 : :
1216 tgl@sss.pgh.pa.us 387 : 9940 : pdir = palloc(sizeof(PartitionDirectoryData));
388 : 9940 : pdir->pdir_mcxt = mcxt;
389 : :
1865 rhaas@postgresql.org 390 : 9940 : ctl.keysize = sizeof(Oid);
391 : 9940 : ctl.entrysize = sizeof(PartitionDirectoryEntry);
392 : 9940 : ctl.hcxt = mcxt;
393 : :
394 : 9940 : pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
395 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1088 alvherre@alvh.no-ip. 396 : 9940 : pdir->omit_detached = omit_detached;
397 : :
1865 rhaas@postgresql.org 398 : 9940 : MemoryContextSwitchTo(oldcontext);
399 : 9940 : return pdir;
400 : : }
401 : :
402 : : /*
403 : : * PartitionDirectoryLookup
404 : : * Look up the partition descriptor for a relation in the directory.
405 : : *
406 : : * The purpose of this function is to ensure that we get the same
407 : : * PartitionDesc for each relation every time we look it up. In the
408 : : * face of concurrent DDL, different PartitionDescs may be constructed with
409 : : * different views of the catalog state, but any single particular OID
410 : : * will always get the same PartitionDesc for as long as the same
411 : : * PartitionDirectory is used.
412 : : */
413 : : PartitionDesc
414 : 21566 : PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
415 : : {
416 : : PartitionDirectoryEntry *pde;
417 : 21566 : Oid relid = RelationGetRelid(rel);
418 : : bool found;
419 : :
420 : 21566 : pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
421 [ + + ]: 21566 : if (!found)
422 : : {
423 : : /*
424 : : * We must keep a reference count on the relation so that the
425 : : * PartitionDesc to which we are pointing can't get destroyed.
426 : : */
427 : 13237 : RelationIncrementReferenceCount(rel);
428 : 13237 : pde->rel = rel;
1088 alvherre@alvh.no-ip. 429 : 13237 : pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
1865 rhaas@postgresql.org 430 [ - + ]: 13237 : Assert(pde->pd != NULL);
431 : : }
432 : 21566 : return pde->pd;
433 : : }
434 : :
435 : : /*
436 : : * DestroyPartitionDirectory
437 : : * Destroy a partition directory.
438 : : *
439 : : * Release the reference counts we're holding.
440 : : */
441 : : void
442 : 9578 : DestroyPartitionDirectory(PartitionDirectory pdir)
443 : : {
444 : : HASH_SEQ_STATUS status;
445 : : PartitionDirectoryEntry *pde;
446 : :
447 : 9578 : hash_seq_init(&status, pdir->pdir_hash);
448 [ + + ]: 22344 : while ((pde = hash_seq_search(&status)) != NULL)
449 : 12766 : RelationDecrementReferenceCount(pde->rel);
450 : 9578 : }
451 : :
452 : : /*
453 : : * get_default_oid_from_partdesc
454 : : *
455 : : * Given a partition descriptor, return the OID of the default partition, if
456 : : * one exists; else, return InvalidOid.
457 : : */
458 : : Oid
1879 459 : 10819 : get_default_oid_from_partdesc(PartitionDesc partdesc)
460 : : {
461 [ + - + + ]: 10819 : if (partdesc && partdesc->boundinfo &&
462 [ + + ]: 6736 : partition_bound_has_default(partdesc->boundinfo))
463 : 955 : return partdesc->oids[partdesc->boundinfo->default_index];
464 : :
465 : 9864 : return InvalidOid;
466 : : }
|