Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * partdesc.c
4 : * Support routines for manipulating partition descriptors
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/partitioning/partdesc.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/genam.h"
18 : #include "access/htup_details.h"
19 : #include "access/table.h"
20 : #include "catalog/partition.h"
21 : #include "catalog/pg_inherits.h"
22 : #include "partitioning/partbounds.h"
23 : #include "partitioning/partdesc.h"
24 : #include "storage/bufmgr.h"
25 : #include "storage/sinval.h"
26 : #include "utils/builtins.h"
27 : #include "utils/fmgroids.h"
28 : #include "utils/hsearch.h"
29 : #include "utils/inval.h"
30 : #include "utils/lsyscache.h"
31 : #include "utils/memutils.h"
32 : #include "utils/partcache.h"
33 : #include "utils/rel.h"
34 : #include "utils/syscache.h"
35 :
36 : typedef struct PartitionDirectoryData
37 : {
38 : MemoryContext pdir_mcxt;
39 : HTAB *pdir_hash;
40 : bool omit_detached;
41 : } PartitionDirectoryData;
42 :
43 : typedef struct PartitionDirectoryEntry
44 : {
45 : Oid reloid;
46 : Relation rel;
47 : PartitionDesc pd;
48 : } PartitionDirectoryEntry;
49 :
50 : static PartitionDesc RelationBuildPartitionDesc(Relation rel,
51 : bool omit_detached);
52 :
53 :
54 : /*
55 : * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
56 : *
57 : * We keep two partdescs in relcache: rd_partdesc includes all partitions
58 : * (even those being concurrently marked detached), while rd_partdesc_nodetach
59 : * omits (some of) those. We store the pg_inherits.xmin value for the latter,
60 : * to determine whether it can be validly reused in each case, since that
61 : * depends on the active snapshot.
62 : *
63 : * Note: we arrange for partition descriptors to not get freed until the
64 : * relcache entry's refcount goes to zero (see hacks in RelationClose,
65 : * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
66 : * though we hand back a direct pointer into the relcache entry, it's safe
67 : * for callers to continue to use that pointer as long as (a) they hold the
68 : * relation open, and (b) they hold a relation lock strong enough to ensure
69 : * that the data doesn't become stale.
70 : */
71 : PartitionDesc
717 alvherre 72 CBC 27537 : RelationGetPartitionDesc(Relation rel, bool omit_detached)
73 : {
74 27537 : Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
75 :
76 : /*
77 : * If relcache has a partition descriptor, use that. However, we can only
78 : * do so when we are asked to include all partitions including detached;
79 : * and also when we know that there are no detached partitions.
80 : *
81 : * If there is no active snapshot, detached partitions aren't omitted
82 : * either, so we can use the cached descriptor too in that case.
83 : */
84 27537 : if (likely(rel->rd_partdesc &&
85 : (!rel->rd_partdesc->detached_exist || !omit_detached ||
86 : !ActiveSnapshotSet())))
87 17702 : return rel->rd_partdesc;
88 :
89 : /*
90 : * If we're asked to omit detached partitions, we may be able to use a
91 : * cached descriptor too. We determine that based on the pg_inherits.xmin
92 : * that was saved alongside that descriptor: if the xmin that was not in
93 : * progress for that active snapshot is also not in progress for the
94 : * current active snapshot, then we can use it. Otherwise build one from
95 : * scratch.
96 : */
711 97 9835 : if (omit_detached &&
98 9557 : rel->rd_partdesc_nodetached &&
99 7 : ActiveSnapshotSet())
100 : {
101 : Snapshot activesnap;
102 :
703 103 7 : Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
711 104 7 : activesnap = GetActiveSnapshot();
105 :
106 7 : if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
107 7 : return rel->rd_partdesc_nodetached;
108 : }
109 :
717 110 9828 : return RelationBuildPartitionDesc(rel, omit_detached);
111 : }
112 :
113 : /*
114 : * RelationBuildPartitionDesc
115 : * Form rel's partition descriptor, and store in relcache entry
116 : *
117 : * Partition descriptor is a complex structure; to avoid complicated logic to
118 : * free individual elements whenever the relcache entry is flushed, we give it
119 : * its own memory context, a child of CacheMemoryContext, which can easily be
120 : * deleted on its own. To avoid leaking memory in that context in case of an
121 : * error partway through this function, the context is initially created as a
122 : * child of CurTransactionContext and only re-parented to CacheMemoryContext
123 : * at the end, when no further errors are possible. Also, we don't make this
124 : * context the current context except in very brief code sections, out of fear
125 : * that some of our callees allocate memory on their own which would be leaked
126 : * permanently.
127 : *
128 : * As a special case, partition descriptors that are requested to omit
129 : * partitions being detached (and which contain such partitions) are transient
130 : * and are not associated with the relcache entry. Such descriptors only last
131 : * through the requesting Portal, so we use the corresponding memory context
132 : * for them.
133 : */
134 : static PartitionDesc
135 9828 : RelationBuildPartitionDesc(Relation rel, bool omit_detached)
136 : {
137 : PartitionDesc partdesc;
1508 rhaas 138 9828 : PartitionBoundInfo boundinfo = NULL;
139 : List *inhoids;
140 9828 : PartitionBoundSpec **boundspecs = NULL;
141 9828 : Oid *oids = NULL;
1201 tgl 142 9828 : bool *is_leaf = NULL;
143 : bool detached_exist;
144 : bool is_omit;
145 : TransactionId detached_xmin;
146 : ListCell *cell;
147 : int i,
148 : nparts;
1508 rhaas 149 9828 : PartitionKey key = RelationGetPartitionKey(rel);
150 : MemoryContext new_pdcxt;
151 : MemoryContext oldcxt;
152 : int *mapping;
153 :
154 : /*
155 : * Get partition oids from pg_inherits. This uses a single snapshot to
156 : * fetch the list of children, so while more children may be getting added
157 : * concurrently, whatever this function returns will be accurate as of
158 : * some well-defined point in time.
159 : */
717 alvherre 160 9828 : detached_exist = false;
711 161 9828 : detached_xmin = InvalidTransactionId;
162 9828 : inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
163 : omit_detached, NoLock,
164 : &detached_exist,
165 : &detached_xmin);
166 :
1508 rhaas 167 9828 : nparts = list_length(inhoids);
168 :
169 : /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
170 9828 : if (nparts > 0)
171 : {
1201 tgl 172 7269 : oids = (Oid *) palloc(nparts * sizeof(Oid));
173 7269 : is_leaf = (bool *) palloc(nparts * sizeof(bool));
1508 rhaas 174 7269 : boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
175 : }
176 :
177 : /* Collect bound spec nodes for each partition. */
178 9828 : i = 0;
179 24377 : foreach(cell, inhoids)
180 : {
181 14549 : Oid inhrelid = lfirst_oid(cell);
182 : HeapTuple tuple;
1494 183 14549 : PartitionBoundSpec *boundspec = NULL;
184 :
185 : /* Try fetching the tuple from the catcache, for speed. */
1508 186 14549 : tuple = SearchSysCache1(RELOID, inhrelid);
1494 187 14549 : if (HeapTupleIsValid(tuple))
188 : {
189 : Datum datum;
190 : bool isnull;
191 :
192 14549 : datum = SysCacheGetAttr(RELOID, tuple,
193 : Anum_pg_class_relpartbound,
194 : &isnull);
195 14549 : if (!isnull)
196 14549 : boundspec = stringToNode(TextDatumGetCString(datum));
197 14549 : ReleaseSysCache(tuple);
198 : }
199 :
200 : /*
201 : * The system cache may be out of date; if so, we may find no pg_class
202 : * tuple or an old one where relpartbound is NULL. In that case, try
203 : * the table directly. We can't just AcceptInvalidationMessages() and
204 : * retry the system cache lookup because it's possible that a
205 : * concurrent ATTACH PARTITION operation has removed itself from the
206 : * ProcArray but not yet added invalidation messages to the shared
207 : * queue; InvalidateSystemCaches() would work, but seems excessive.
208 : *
209 : * Note that this algorithm assumes that PartitionBoundSpec we manage
210 : * to fetch is the right one -- so this is only good enough for
211 : * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
212 : * some hypothetical operation that changes the partition bounds.
213 : */
214 14549 : if (boundspec == NULL)
215 : {
216 : Relation pg_class;
217 : SysScanDesc scan;
218 : ScanKeyData key[1];
219 : Datum datum;
220 : bool isnull;
221 :
1494 rhaas 222 UBC 0 : pg_class = table_open(RelationRelationId, AccessShareLock);
223 0 : ScanKeyInit(&key[0],
224 : Anum_pg_class_oid,
225 : BTEqualStrategyNumber, F_OIDEQ,
226 : ObjectIdGetDatum(inhrelid));
227 0 : scan = systable_beginscan(pg_class, ClassOidIndexId, true,
228 : NULL, 1, key);
229 0 : tuple = systable_getnext(scan);
230 0 : datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
231 : RelationGetDescr(pg_class), &isnull);
232 0 : if (!isnull)
233 0 : boundspec = stringToNode(TextDatumGetCString(datum));
234 0 : systable_endscan(scan);
235 0 : table_close(pg_class, AccessShareLock);
236 : }
237 :
238 : /* Sanity checks. */
1494 rhaas 239 CBC 14549 : if (!boundspec)
1494 rhaas 240 UBC 0 : elog(ERROR, "missing relpartbound for relation %u", inhrelid);
1508 rhaas 241 CBC 14549 : if (!IsA(boundspec, PartitionBoundSpec))
1508 rhaas 242 UBC 0 : elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
243 :
244 : /*
245 : * If the PartitionBoundSpec says this is the default partition, its
246 : * OID should match pg_partitioned_table.partdefid; if not, the
247 : * catalog is corrupt.
248 : */
1508 rhaas 249 CBC 14549 : if (boundspec->is_default)
250 : {
251 : Oid partdefid;
252 :
253 800 : partdefid = get_default_partition_oid(RelationGetRelid(rel));
254 800 : if (partdefid != inhrelid)
1508 rhaas 255 UBC 0 : elog(ERROR, "expected partdefid %u, but got %u",
256 : inhrelid, partdefid);
257 : }
258 :
259 : /* Save results. */
1508 rhaas 260 CBC 14549 : oids[i] = inhrelid;
1201 tgl 261 14549 : is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
1508 rhaas 262 14549 : boundspecs[i] = boundspec;
263 14549 : ++i;
264 : }
265 :
266 : /*
267 : * Create PartitionBoundInfo and mapping, working in the caller's context.
268 : * This could fail, but we haven't done any damage if so.
269 : */
1201 tgl 270 9828 : if (nparts > 0)
271 7269 : boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
272 :
273 : /*
274 : * Now build the actual relcache partition descriptor, copying all the
275 : * data into a new, small context. As per above comment, we don't make
276 : * this a long-lived context until it's finished.
277 : */
278 9828 : new_pdcxt = AllocSetContextCreate(CurTransactionContext,
279 : "partition descriptor",
280 : ALLOCSET_SMALL_SIZES);
281 9828 : MemoryContextCopyAndSetIdentifier(new_pdcxt,
282 : RelationGetRelationName(rel));
283 :
284 : partdesc = (PartitionDescData *)
285 9828 : MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
1508 rhaas 286 9828 : partdesc->nparts = nparts;
717 alvherre 287 9828 : partdesc->detached_exist = detached_exist;
288 : /* If there are no partitions, the rest of the partdesc can stay zero */
1487 tgl 289 9828 : if (nparts > 0)
290 : {
1201 291 7269 : oldcxt = MemoryContextSwitchTo(new_pdcxt);
1487 292 7269 : partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
293 :
294 : /* Initialize caching fields for speeding up ExecFindPartition */
250 drowley 295 GNC 7269 : partdesc->last_found_datum_index = -1;
296 7269 : partdesc->last_found_part_index = -1;
297 7269 : partdesc->last_found_count = 0;
298 :
1487 tgl 299 GIC 7269 : partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
300 7269 : partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
1508 rhaas 301 ECB :
1487 tgl 302 : /*
303 : * Assign OIDs from the original array into mapped indexes of the
304 : * result array. The order of OIDs in the former is defined by the
305 : * catalog scan that retrieved them, whereas that in the latter is
306 : * defined by canonicalized representation of the partition bounds.
307 : * Also save leaf-ness of each partition.
308 : */
1487 tgl 309 GIC 21818 : for (i = 0; i < nparts; i++)
310 : {
311 14549 : int index = mapping[i];
312 :
313 14549 : partdesc->oids[index] = oids[i];
1201 314 14549 : partdesc->is_leaf[index] = is_leaf[i];
1487 tgl 315 ECB : }
1201 tgl 316 GIC 7269 : MemoryContextSwitchTo(oldcxt);
1508 rhaas 317 ECB : }
318 :
711 alvherre 319 : /*
703 320 : * Are we working with the partdesc that omits the detached partition, or
321 : * the one that includes it?
322 : *
323 : * Note that if a partition was found by the catalog's scan to have been
324 : * detached, but the pg_inherit tuple saying so was not visible to the
325 : * active snapshot (find_inheritance_children_extended will not have set
326 : * detached_xmin in that case), we consider there to be no "omittable"
327 : * detached partitions.
328 : */
703 alvherre 329 GIC 9873 : is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
330 45 : TransactionIdIsValid(detached_xmin);
331 :
332 : /*
333 : * We have a fully valid partdesc. Reparent it so that it has the right
334 : * lifespan.
1201 tgl 335 ECB : */
711 alvherre 336 CBC 9828 : MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
337 :
338 : /*
339 : * Store it into relcache.
340 : *
341 : * But first, a kluge: if there's an old context for this type of
711 alvherre 342 ECB : * descriptor, it contains an old partition descriptor that may still be
343 : * referenced somewhere. Preserve it, while not leaking it, by
344 : * reattaching it as a child context of the new one. Eventually it will
345 : * get dropped by either RelationClose or RelationClearRelation. (We keep
346 : * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
347 : * detached-partitions in rd_pddcxt.)
348 : */
711 alvherre 349 GIC 9828 : if (is_omit)
350 : {
351 33 : if (rel->rd_pddcxt != NULL)
711 alvherre 352 UIC 0 : MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
711 alvherre 353 GIC 33 : rel->rd_pddcxt = new_pdcxt;
354 33 : rel->rd_partdesc_nodetached = partdesc;
711 alvherre 355 ECB :
356 : /*
357 : * For partdescs built excluding detached partitions, which we save
711 alvherre 358 EUB : * separately, we also record the pg_inherits.xmin of the detached
711 alvherre 359 ECB : * partition that was omitted; this informs a future potential user of
703 360 : * such a cached partdesc to only use it after cross-checking that the
361 : * xmin is indeed visible to the snapshot it is going to be working
362 : * with.
363 : */
703 alvherre 364 GIC 33 : Assert(TransactionIdIsValid(detached_xmin));
711 365 33 : rel->rd_partdesc_nodetached_xmin = detached_xmin;
366 : }
367 : else
368 : {
717 369 9795 : if (rel->rd_pdcxt != NULL)
717 alvherre 370 CBC 2050 : MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
371 9795 : rel->rd_pdcxt = new_pdcxt;
717 alvherre 372 GIC 9795 : rel->rd_partdesc = partdesc;
373 : }
374 :
717 alvherre 375 CBC 9828 : return partdesc;
1508 rhaas 376 ECB : }
377 :
1494 378 : /*
379 : * CreatePartitionDirectory
380 : * Create a new partition directory object.
381 : */
382 : PartitionDirectory
717 alvherre 383 GIC 8436 : CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
384 : {
1494 rhaas 385 8436 : MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
386 : PartitionDirectory pdir;
387 : HASHCTL ctl;
388 :
845 tgl 389 CBC 8436 : pdir = palloc(sizeof(PartitionDirectoryData));
845 tgl 390 GIC 8436 : pdir->pdir_mcxt = mcxt;
845 tgl 391 ECB :
1494 rhaas 392 GIC 8436 : ctl.keysize = sizeof(Oid);
393 8436 : ctl.entrysize = sizeof(PartitionDirectoryEntry);
394 8436 : ctl.hcxt = mcxt;
1494 rhaas 395 ECB :
1494 rhaas 396 CBC 8436 : pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
397 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
717 alvherre 398 8436 : pdir->omit_detached = omit_detached;
1494 rhaas 399 ECB :
1494 rhaas 400 CBC 8436 : MemoryContextSwitchTo(oldcontext);
1494 rhaas 401 GIC 8436 : return pdir;
1494 rhaas 402 ECB : }
403 :
404 : /*
405 : * PartitionDirectoryLookup
406 : * Look up the partition descriptor for a relation in the directory.
407 : *
408 : * The purpose of this function is to ensure that we get the same
409 : * PartitionDesc for each relation every time we look it up. In the
410 : * face of concurrent DDL, different PartitionDescs may be constructed with
411 : * different views of the catalog state, but any single particular OID
412 : * will always get the same PartitionDesc for as long as the same
413 : * PartitionDirectory is used.
414 : */
415 : PartitionDesc
1494 rhaas 416 GIC 19246 : PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
417 : {
418 : PartitionDirectoryEntry *pde;
419 19246 : Oid relid = RelationGetRelid(rel);
420 : bool found;
421 :
1494 rhaas 422 CBC 19246 : pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
1494 rhaas 423 GIC 19246 : if (!found)
424 : {
1494 rhaas 425 ECB : /*
426 : * We must keep a reference count on the relation so that the
427 : * PartitionDesc to which we are pointing can't get destroyed.
428 : */
1494 rhaas 429 CBC 11628 : RelationIncrementReferenceCount(rel);
1494 rhaas 430 GIC 11628 : pde->rel = rel;
717 alvherre 431 11628 : pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
1494 rhaas 432 11628 : Assert(pde->pd != NULL);
433 : }
434 19246 : return pde->pd;
1494 rhaas 435 ECB : }
436 :
437 : /*
438 : * DestroyPartitionDirectory
439 : * Destroy a partition directory.
440 : *
441 : * Release the reference counts we're holding.
442 : */
443 : void
1494 rhaas 444 GIC 8096 : DestroyPartitionDirectory(PartitionDirectory pdir)
445 : {
446 : HASH_SEQ_STATUS status;
447 : PartitionDirectoryEntry *pde;
448 :
449 8096 : hash_seq_init(&status, pdir->pdir_hash);
1494 rhaas 450 CBC 19278 : while ((pde = hash_seq_search(&status)) != NULL)
1494 rhaas 451 GIC 11182 : RelationDecrementReferenceCount(pde->rel);
452 8096 : }
453 :
454 : /*
1508 rhaas 455 ECB : * get_default_oid_from_partdesc
456 : *
457 : * Given a partition descriptor, return the OID of the default partition, if
458 : * one exists; else, return InvalidOid.
459 : */
460 : Oid
1508 rhaas 461 GIC 9203 : get_default_oid_from_partdesc(PartitionDesc partdesc)
462 : {
463 9203 : if (partdesc && partdesc->boundinfo &&
464 5527 : partition_bound_has_default(partdesc->boundinfo))
465 629 : return partdesc->oids[partdesc->boundinfo->default_index];
466 :
1508 rhaas 467 CBC 8574 : return InvalidOid;
468 : }
|