/*
 * brin.c
 *		Implementation of BRIN indexes for Postgres
 *
 * See src/backend/access/brin/README for details.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin.c
 *
 * TODO
 *		* ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
 */
#include "postgres.h"

#include "access/brin.h"
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "tcop/tcopprot.h"		/* pgrminclude ignore */
#include "utils/acl.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/guc.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"

/* Magic numbers for parallel state sharing */
#define PARALLEL_KEY_BRIN_SHARED		UINT64CONST(0xB000000000000001)
#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xB000000000000002)
#define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xB000000000000003)
#define PARALLEL_KEY_WAL_USAGE			UINT64CONST(0xB000000000000004)
#define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xB000000000000005)

/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.
 */
typedef struct BrinShared
{
	/*
	 * These fields are not modified during the build.  They primarily exist
	 * for the benefit of worker processes that need to create state
	 * corresponding to that used by the leader.
	 */
	Oid			heaprelid;
	Oid			indexrelid;
	bool		isconcurrent;
	BlockNumber pagesPerRange;
	int			scantuplesortstates;

	/*
	 * workersdonecv is used to monitor the progress of workers.  All
	 * parallel participants must indicate that they are done before the
	 * leader can use the results built by the workers (and before the
	 * leader can write the data into the index).
	 */
	ConditionVariable workersdonecv;

	/*
	 * mutex protects all fields before heapdesc.
	 *
	 * These fields contain status information of interest to BRIN index
	 * builds that must work just the same when an index is built in
	 * parallel.
	 */
	slock_t		mutex;

	/*
	 * Mutable state that is maintained by workers, and reported back to the
	 * leader at the end of the scans.
	 *
	 * nparticipantsdone is the number of worker processes that have
	 * finished.
	 *
	 * reltuples is the total number of input heap tuples.
	 *
	 * indtuples is the total number of tuples that made it into the index.
	 */
	int			nparticipantsdone;
	double		reltuples;
	double		indtuples;

	/*
	 * ParallelTableScanDescData data follows.  Can't directly embed here, as
	 * implementations of the parallel table scan desc interface might need
	 * stronger alignment.
	 */
} BrinShared;

/*
 * Return pointer to a BrinShared's parallel table scan.
 *
 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
 * MAXALIGN.
 */
#define ParallelTableScanFromBrinShared(shared) \
	(ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
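
/*
 * For example, the shared memory chunk used by a parallel build is laid out
 * as
 *
 *		| BrinShared | padding up to BUFFERALIGN | ParallelTableScanDescData |
 *
 * so the scan descriptor starts at the first buffer-aligned offset past the
 * fixed-size BrinShared struct.
 */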

/*
 * Status for leader in parallel index build.
 */
typedef struct BrinLeader
{
	/* parallel context itself */
	ParallelContext *pcxt;

	/*
	 * nparticipanttuplesorts is the exact number of worker processes
	 * successfully launched, plus one leader process if it participates as a
	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
	 * participating as a worker).
	 */
	int			nparticipanttuplesorts;

	/*
	 * Leader process convenience pointers to shared state (leader avoids TOC
	 * lookups).
	 *
	 * brinshared is the shared state for the entire build.  sharedsort is
	 * the shared, tuplesort-managed state passed to each process tuplesort.
	 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
	 * required.
	 */
	BrinShared *brinshared;
	Sharedsort *sharedsort;
	Snapshot	snapshot;
	WalUsage   *walusage;
	BufferUsage *bufferusage;
} BrinLeader;

/*
 * We use a BrinBuildState during initial construction of a BRIN index.
 * The running state is kept in a BrinMemTuple.
 */
typedef struct BrinBuildState
{
	Relation	bs_irel;
	double		bs_numtuples;
	double		bs_reltuples;
	Buffer		bs_currentInsertBuf;
	BlockNumber bs_pagesPerRange;
	BlockNumber bs_currRangeStart;
	BlockNumber bs_maxRangeStart;
	BrinRevmap *bs_rmAccess;
	BrinDesc   *bs_bdesc;
	BrinMemTuple *bs_dtuple;

	BrinTuple  *bs_emptyTuple;
	Size		bs_emptyTupleLen;
	MemoryContext bs_context;

	/*
	 * bs_leader is only present when a parallel index build is performed,
	 * and only in the leader process.  (Actually, only the leader process
	 * has a BrinBuildState.)
	 */
	BrinLeader *bs_leader;
	int			bs_worker_id;

	/*
	 * The sortstate is used by workers (including the leader).  It has to be
	 * part of the build state, because that's the only thing passed to the
	 * build callback etc.
	 */
	Tuplesortstate *bs_sortstate;
} BrinBuildState;

/*
 * We use a BrinInsertState to capture running state spanning multiple
 * brininsert invocations, within the same command.
 */
typedef struct BrinInsertState
{
	BrinRevmap *bis_rmAccess;
	BrinDesc   *bis_desc;
	BlockNumber bis_pages_per_range;
} BrinInsertState;

/*
 * Struct used as "opaque" during index scans
 */
typedef struct BrinOpaque
{
	BlockNumber bo_pagesPerRange;
	BrinRevmap *bo_rmAccess;
	BrinDesc   *bo_bdesc;
} BrinOpaque;

#define BRIN_ALL_BLOCKRANGES	InvalidBlockNumber

static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
												  BrinRevmap *revmap,
												  BlockNumber pagesPerRange,
												  BlockNumber tablePages);
static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
static void terminate_brin_buildstate(BrinBuildState *state);
static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
						  bool include_partial, double *numSummarized, double *numExisting);
static void form_and_insert_tuple(BrinBuildState *state);
static void form_and_spill_tuple(BrinBuildState *state);
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
						 BrinTuple *b);
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
								BrinMemTuple *dtup, const Datum *values, const bool *nulls);
static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
static void brin_fill_empty_ranges(BrinBuildState *state,
								   BlockNumber prevRange, BlockNumber maxRange);

/* parallel index builds */
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
								 bool isconcurrent, int request);
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
											   Relation heap, Relation index);
static void _brin_parallel_scan_and_build(BrinBuildState *buildstate,
										  BrinShared *brinshared,
										  Sharedsort *sharedsort,
										  Relation heap, Relation index,
										  int sortmem, bool progress);

/*
 * BRIN handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
brinhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = 0;
	amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
	amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
	amroutine->amcanorder = false;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = false;
	amroutine->amcanunique = false;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = false;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = true;
	amroutine->amclusterable = false;
	amroutine->ampredlocks = false;
	amroutine->amcanparallel = false;
	amroutine->amcanbuildparallel = true;
	amroutine->amcaninclude = false;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = true;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_CLEANUP;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = brinbuild;
	amroutine->ambuildempty = brinbuildempty;
	amroutine->aminsert = brininsert;
	amroutine->aminsertcleanup = brininsertcleanup;
	amroutine->ambulkdelete = brinbulkdelete;
	amroutine->amvacuumcleanup = brinvacuumcleanup;
	amroutine->amcanreturn = NULL;
	amroutine->amcostestimate = brincostestimate;
	amroutine->amoptions = brinoptions;
	amroutine->amproperty = NULL;
	amroutine->ambuildphasename = NULL;
	amroutine->amvalidate = brinvalidate;
	amroutine->amadjustmembers = NULL;
	amroutine->ambeginscan = brinbeginscan;
	amroutine->amrescan = brinrescan;
	amroutine->amgettuple = NULL;
	amroutine->amgetbitmap = bringetbitmap;
	amroutine->amendscan = brinendscan;
	amroutine->ammarkpos = NULL;
	amroutine->amrestrpos = NULL;
	amroutine->amestimateparallelscan = NULL;
	amroutine->aminitparallelscan = NULL;
	amroutine->amparallelrescan = NULL;

	PG_RETURN_POINTER(amroutine);
}

/*
 * Initialize a BrinInsertState to maintain state to be used across multiple
 * tuple inserts, within the same command.
 */
static BrinInsertState *
initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
{
	BrinInsertState *bistate;
	MemoryContext oldcxt;

	oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
	bistate = palloc0(sizeof(BrinInsertState));
	bistate->bis_desc = brin_build_desc(idxRel);
	bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
												 &bistate->bis_pages_per_range);
	indexInfo->ii_AmCache = bistate;
	MemoryContextSwitchTo(oldcxt);

	return bistate;
}

/*
 * A tuple in the heap is being inserted.  To keep a brin index up to date,
 * we need to obtain the relevant index tuple and compare its stored values
 * with those of the new tuple.  If the tuple values are not consistent with
 * the summary tuple, we need to update the index tuple.
 *
 * If autosummarization is enabled, check if we need to summarize the
 * previous page range.
 *
 * If the range is not currently summarized (i.e. the revmap returns NULL for
 * it), there's nothing to do for this tuple.
 */
bool
brininsert(Relation idxRel, Datum *values, bool *nulls,
		   ItemPointer heaptid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   bool indexUnchanged,
		   IndexInfo *indexInfo)
{
	BlockNumber pagesPerRange;
	BlockNumber origHeapBlk;
	BlockNumber heapBlk;
	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
	BrinRevmap *revmap;
	BrinDesc   *bdesc;
	Buffer		buf = InvalidBuffer;
	MemoryContext tupcxt = NULL;
	MemoryContext oldcxt = CurrentMemoryContext;
	bool		autosummarize = BrinGetAutoSummarize(idxRel);

	/*
	 * If first time through in this statement, initialize the insert state
	 * that we keep for all the inserts in the command.
	 */
	if (!bistate)
		bistate = initialize_brin_insertstate(idxRel, indexInfo);

	revmap = bistate->bis_rmAccess;
	bdesc = bistate->bis_desc;
	pagesPerRange = bistate->bis_pages_per_range;

	/*
	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
	 * is the first block in the corresponding page range.
	 */
	origHeapBlk = ItemPointerGetBlockNumber(heaptid);
	heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
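
	/*
	 * For example, with the default pages_per_range of 128, an insertion
	 * into heap block 300 belongs to the range starting at heap block
	 * (300 / 128) * 128 = 256.
	 */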

	for (;;)
	{
		bool		need_insert = false;
		OffsetNumber off;
		BrinTuple  *brtup;
		BrinMemTuple *dtup;

		CHECK_FOR_INTERRUPTS();

		/*
		 * If auto-summarization is enabled and we just inserted the first
		 * tuple into the first block of a new non-first page range, request
		 * a summarization run of the previous range.
		 */
		if (autosummarize &&
			heapBlk > 0 &&
			heapBlk == origHeapBlk &&
			ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
		{
			BlockNumber lastPageRange = heapBlk - 1;
			BrinTuple  *lastPageTuple;

			lastPageTuple =
				brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
										 NULL, BUFFER_LOCK_SHARE);
			if (!lastPageTuple)
			{
				bool		recorded;

				recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
												 RelationGetRelid(idxRel),
												 lastPageRange);
				if (!recorded)
					ereport(LOG,
							(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
							 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
									RelationGetRelationName(idxRel),
									lastPageRange)));
			}
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}

		brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
										 NULL, BUFFER_LOCK_SHARE);

		/* if range is unsummarized, there's nothing to do */
		if (!brtup)
			break;

		/* First time through in this brininsert call? */
		if (tupcxt == NULL)
		{
			tupcxt = AllocSetContextCreate(CurrentMemoryContext,
										   "brininsert cxt",
										   ALLOCSET_DEFAULT_SIZES);
			MemoryContextSwitchTo(tupcxt);
		}

		dtup = brin_deform_tuple(bdesc, brtup, NULL);

		need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);

		if (!need_insert)
		{
			/*
			 * The tuple is consistent with the new values, so there's
			 * nothing to do.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
		else
		{
			Page		page = BufferGetPage(buf);
			ItemId		lp = PageGetItemId(page, off);
			Size		origsz;
			BrinTuple  *origtup;
			Size		newsz;
			BrinTuple  *newtup;
			bool		samepage;

			/*
			 * Make a copy of the old tuple, so that we can compare it after
			 * re-acquiring the lock.
			 */
			origsz = ItemIdGetLength(lp);
			origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);

			/*
			 * Before releasing the lock, check if we can attempt a same-page
			 * update.  Another process could insert a tuple concurrently in
			 * the same page though, so downstream we must be prepared to
			 * cope if this turns out to not be possible after all.
			 */
			newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
			samepage = brin_can_do_samepage_update(buf, origsz, newsz);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/*
			 * Try to update the tuple.  If this doesn't work for whatever
			 * reason, we need to restart from the top; the revmap might be
			 * pointing at a different tuple for this block now, so we need
			 * to recompute to ensure both our new heap tuple and the other
			 * inserter's are covered by the combined tuple.  It might be
			 * that we don't need to update at all.
			 */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
							   buf, off, origtup, origsz, newtup, newsz,
							   samepage))
			{
				/* no luck; start over */
				MemoryContextReset(tupcxt);
				continue;
			}
		}

		/* success! */
		break;
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
	MemoryContextSwitchTo(oldcxt);
	if (tupcxt != NULL)
		MemoryContextDelete(tupcxt);

	return false;
}

/*
 * Callback to clean up the BrinInsertState once all tuple inserts are done.
 */
void
brininsertcleanup(IndexInfo *indexInfo)
{
	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;

	Assert(bistate);

	/*
	 * Clean up the revmap.  Note that the brinDesc has already been cleaned
	 * up as part of its own memory context.
	 */
	brinRevmapTerminate(bistate->bis_rmAccess);
	bistate->bis_rmAccess = NULL;
	bistate->bis_desc = NULL;
}

/*
 * Initialize state for a BRIN index scan.
 *
 * We read the metapage here to determine the pages-per-range number that
 * this index was built with.  Note that since this cannot be changed while
 * we're holding lock on index, it's not necessary to recompute it during
 * brinrescan.
 */
IndexScanDesc
brinbeginscan(Relation r, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BrinOpaque *opaque;

	scan = RelationGetIndexScan(r, nkeys, norderbys);

	opaque = palloc_object(BrinOpaque);
	opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
	opaque->bo_bdesc = brin_build_desc(r);
	scan->opaque = opaque;

	return scan;
}

/*
 * Execute the index scan.
 *
 * This works by reading index TIDs from the revmap, and obtaining the index
 * tuples pointed to by them; the summary values in the index tuples are
 * compared to the scan keys.  We return into the TID bitmap all the pages in
 * ranges corresponding to index tuples that match the scan keys.
 *
 * If a TID from the revmap is read as InvalidTID, we know that range is
 * unsummarized.  Pages in those ranges need to be returned regardless of
 * scan keys.
 */
int64
bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	Relation	idxRel = scan->indexRelation;
	Buffer		buf = InvalidBuffer;
	BrinDesc   *bdesc;
	Oid			heapOid;
	Relation	heapRel;
	BrinOpaque *opaque;
	BlockNumber nblocks;
	BlockNumber heapBlk;
	int			totalpages = 0;
	FmgrInfo   *consistentFn;
	MemoryContext oldcxt;
	MemoryContext perRangeCxt;
	BrinMemTuple *dtup;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;
	ScanKey   **keys,
			  **nullkeys;
	int		   *nkeys,
			   *nnullkeys;
	char	   *ptr;
	Size		len;
	char	   *tmp PG_USED_FOR_ASSERTS_ONLY;

	opaque = (BrinOpaque *) scan->opaque;
	bdesc = opaque->bo_bdesc;
	pgstat_count_index_scan(idxRel);

	/*
	 * We need to know the size of the table so that we know how long to
	 * iterate on the revmap.
	 */
	heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
	heapRel = table_open(heapOid, AccessShareLock);
	nblocks = RelationGetNumberOfBlocks(heapRel);
	table_close(heapRel, AccessShareLock);

	/*
	 * Make room for the consistent support procedures of indexed columns.
	 * We don't look them up here; we do that lazily the first time we see a
	 * scan key reference each of them.  We rely on zeroing fn_oid to
	 * InvalidOid.
	 */
	consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);

	/*
	 * Make room for per-attribute lists of scan keys that we'll pass to the
	 * consistent support procedure.  We don't know which attributes have
	 * scan keys, so we allocate space for all attributes.  That may use more
	 * memory but it's probably cheaper than determining which attributes are
	 * used.
	 *
	 * We keep null and regular keys separate, so that we can pass just the
	 * regular keys to the consistent function easily.
	 *
	 * To reduce the allocation overhead, we allocate one big chunk and then
	 * carve it into smaller arrays ourselves.  All the pieces have exactly
	 * the same lifetime, so that's OK.
	 *
	 * XXX The widest index can have 32 attributes, so the amount of wasted
	 * memory is negligible.  We could invent a more compact approach (with
	 * just space for used attributes) but that would make the matching more
	 * complex so it's not a good trade-off.
	 */
	len =
		MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) +	/* regular keys */
		MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
		MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
		MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) +	/* NULL keys */
		MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
		MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
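
	/*
	 * The chunk is then carved up below in this order (natts is the number
	 * of index attributes):
	 *
	 *		keys[natts] | nullkeys[natts] | nkeys[natts] | nnullkeys[natts] |
	 *		natts pairs of per-attribute arrays, each with room for
	 *		scan->numberOfKeys regular keys and as many NULL keys
	 *
	 * with every piece individually MAXALIGNed, matching the computation of
	 * len above.
	 */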

	ptr = palloc(len);
	tmp = ptr;

	keys = (ScanKey **) ptr;
	ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);

	nullkeys = (ScanKey **) ptr;
	ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);

	nkeys = (int *) ptr;
	ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);

	nnullkeys = (int *) ptr;
	ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);

	for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
	{
		keys[i] = (ScanKey *) ptr;
		ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);

		nullkeys[i] = (ScanKey *) ptr;
		ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
	}

	Assert(tmp + len == ptr);

	/* zero the number of keys */
	memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
	memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);

	/* Preprocess the scan keys - split them into per-attribute arrays. */
	for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
	{
		ScanKey		key = &scan->keyData[keyno];
		AttrNumber	keyattno = key->sk_attno;

		/*
		 * The collation of the scan key must match the collation used in the
		 * index column (but only if the search is not IS NULL / IS NOT
		 * NULL).  Otherwise we shouldn't be using this index ...
		 */
		Assert((key->sk_flags & SK_ISNULL) ||
			   (key->sk_collation ==
				TupleDescAttr(bdesc->bd_tupdesc,
							  keyattno - 1)->attcollation));

		/*
		 * First time we see this index attribute, so init as needed.
		 *
		 * This is a bit of overkill - we don't know how many scan keys there
		 * are for this attribute, so we simply allocate the largest number
		 * possible (as if all keys were for this attribute).  This may waste
		 * a bit of memory, but we only expect a small number of scan keys in
		 * general, so this should be negligible, and repeated repalloc calls
		 * are not free either.
		 */
		if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
		{
			FmgrInfo   *tmp;

			/* First time we see this attribute, so no key/null keys. */
			Assert(nkeys[keyattno - 1] == 0);
			Assert(nnullkeys[keyattno - 1] == 0);

			tmp = index_getprocinfo(idxRel, keyattno,
									BRIN_PROCNUM_CONSISTENT);
			fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
						   CurrentMemoryContext);
		}

		/* Add key to the proper per-attribute array. */
		if (key->sk_flags & SK_ISNULL)
		{
			nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
			nnullkeys[keyattno - 1]++;
		}
		else
		{
			keys[keyattno - 1][nkeys[keyattno - 1]] = key;
			nkeys[keyattno - 1]++;
		}
	}

	/* allocate an initial in-memory tuple, out of the per-range memcxt */
	dtup = brin_new_memtuple(bdesc);

	/*
	 * Setup and use a per-range memory context, which is reset every time we
	 * loop below.  This avoids having to free the tuples within the loop.
	 */
	perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
										"bringetbitmap cxt",
										ALLOCSET_DEFAULT_SIZES);
	oldcxt = MemoryContextSwitchTo(perRangeCxt);

	/*
	 * Now scan the revmap.  We start by querying for heap page 0,
	 * incrementing by the number of pages per range; this gives us a full
	 * view of the table.
	 */
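	/*
	 * For instance, a 1000-block table with pagesPerRange = 128 is processed
	 * as eight ranges starting at blocks 0, 128, ..., 896, the last of which
	 * covers only the 104 remaining blocks.
	 */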
	for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
	{
		bool		addrange;
		bool		gottuple = false;
		BrinTuple  *tup;
		OffsetNumber off;
		Size		size;

		CHECK_FOR_INTERRUPTS();

		MemoryContextReset(perRangeCxt);

		tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
									   &off, &size, BUFFER_LOCK_SHARE);
		if (tup)
		{
			gottuple = true;
			btup = brin_copy_tuple(tup, size, btup, &btupsz);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}

		/*
		 * For page ranges with no indexed tuple, we must return the whole
		 * range; otherwise, compare it to the scan keys.
		 */
		if (!gottuple)
		{
			addrange = true;
		}
		else
		{
			dtup = brin_deform_tuple(bdesc, btup, dtup);
			if (dtup->bt_placeholder)
			{
				/*
				 * Placeholder tuples are always returned, regardless of the
				 * values stored in them.
				 */
				addrange = true;
			}
			else
			{
				int			attno;

				/*
				 * Compare scan keys with summary values stored for the
				 * range.  If scan keys are matched, the page range must be
				 * added to the bitmap.  We initially assume the range needs
				 * to be added; in particular this serves the case where
				 * there are no keys.
				 */
				addrange = true;
				for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
				{
					BrinValues *bval;
					Datum		add;
					Oid			collation;

					/*
					 * skip attributes without any scan keys (both regular
					 * and IS [NOT] NULL)
					 */
					if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
						continue;

					bval = &dtup->bt_columns[attno - 1];

					/*
					 * If the BRIN tuple indicates that this range is empty,
					 * we can skip it: there's nothing to match.  We don't
					 * need to examine the next columns.
					 */
					if (dtup->bt_empty_range)
					{
						addrange = false;
						break;
					}

					/*
					 * First check if there are any IS [NOT] NULL scan keys,
					 * and if we're violating them.  In that case we can
					 * terminate early, without invoking the support
					 * function.
					 *
					 * As there may be more keys, we can only determine
					 * mismatch within this loop.
					 */
					if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
						!check_null_keys(bval, nullkeys[attno - 1],
										 nnullkeys[attno - 1]))
					{
						/*
						 * If any of the IS [NOT] NULL keys failed, the page
						 * range as a whole can't pass.  So terminate the
						 * loop.
						 */
						addrange = false;
						break;
					}

					/*
					 * So either there are no IS [NOT] NULL keys, or all
					 * passed.  If there are no regular scan keys, we're done
					 * - the page range matches.  If there are regular keys,
					 * but the page range is marked as 'all nulls' it can't
					 * possibly pass (we're assuming the operators are
					 * strict).
					 */

					/* No regular scan keys - page range as a whole passes. */
					if (!nkeys[attno - 1])
						continue;

					Assert((nkeys[attno - 1] > 0) &&
						   (nkeys[attno - 1] <= scan->numberOfKeys));

					/* If it is all nulls, it cannot possibly be consistent. */
					if (bval->bv_allnulls)
					{
						addrange = false;
						break;
					}

					/*
					 * Collation from the first key (has to be the same for
					 * all keys for the same attribute).
					 */
					collation = keys[attno - 1][0]->sk_collation;

					/*
					 * Check whether the scan key is consistent with the page
					 * range values; if so, have the pages in the range added
					 * to the output bitmap.
					 *
					 * The opclass may or may not support processing of
					 * multiple scan keys.  We can determine that based on
					 * the number of arguments - functions with an extra
					 * parameter (number of scan keys) do support this,
					 * otherwise we have to simply pass the scan keys one by
					 * one.
					 */
					if (consistentFn[attno - 1].fn_nargs >= 4)
					{
						/* Check all keys at once */
						add = FunctionCall4Coll(&consistentFn[attno - 1],
												collation,
												PointerGetDatum(bdesc),
												PointerGetDatum(bval),
												PointerGetDatum(keys[attno - 1]),
												Int32GetDatum(nkeys[attno - 1]));
						addrange = DatumGetBool(add);
					}
					else
					{
						/*
						 * Check keys one by one
						 *
						 * When there are multiple scan keys, failure to meet
						 * the criteria for a single one of them is enough to
						 * discard the range as a whole, so break out of the
						 * loop as soon as a false return value is obtained.
						 */
						int			keyno;

						for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
						{
							add = FunctionCall3Coll(&consistentFn[attno - 1],
													keys[attno - 1][keyno]->sk_collation,
													PointerGetDatum(bdesc),
													PointerGetDatum(bval),
													PointerGetDatum(keys[attno - 1][keyno]));
							addrange = DatumGetBool(add);
							if (!addrange)
								break;
						}
					}

					/*
					 * If we found a scan key eliminating the range, no need
					 * to check additional ones.
					 */
					if (!addrange)
						break;
				}
			}
		}

		/* add the pages in the range to the output bitmap, if needed */
		if (addrange)
		{
			BlockNumber pageno;

			for (pageno = heapBlk;
				 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
				 pageno++)
			{
				MemoryContextSwitchTo(oldcxt);
				tbm_add_page(tbm, pageno);
				totalpages++;
				MemoryContextSwitchTo(perRangeCxt);
			}
		}
	}

	MemoryContextSwitchTo(oldcxt);
	MemoryContextDelete(perRangeCxt);

	if (buf != InvalidBuffer)
		ReleaseBuffer(buf);

	/*
	 * XXX We have an approximation of the number of *pages* that our scan
	 * returns, but we don't have a precise idea of the number of heap tuples
	 * involved.
	 */
	return totalpages * 10;
}

/*
 * Re-initialize state for a BRIN index scan
 */
void
brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys)
{
	/*
	 * Other index AMs preprocess the scan keys at this point, or sometime
	 * early during the scan; this lets them optimize by removing redundant
	 * keys, or doing early returns when they are impossible to satisfy; see
	 * _bt_preprocess_keys for an example.  Something like that could be
	 * added here someday, too.
	 */

	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData, scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
}

/*
 * Close down a BRIN index scan
 */
void
brinendscan(IndexScanDesc scan)
{
	BrinOpaque *opaque = (BrinOpaque *) scan->opaque;

	brinRevmapTerminate(opaque->bo_rmAccess);
	brin_free_desc(opaque->bo_bdesc);
	pfree(opaque);
}

/*
 * Per-heap-tuple callback for table_index_build_scan.
 *
 * Note we don't worry about the page range at the end of the table here; it
 * is present in the build state struct after we're called the last time, but
 * not inserted into the index.  The caller must take care of inserting it,
 * if appropriate.
 */
static void
brinbuildCallback(Relation index,
				  ItemPointer tid,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *brstate)
{
	BrinBuildState *state = (BrinBuildState *) brstate;
	BlockNumber thisblock;

	thisblock = ItemPointerGetBlockNumber(tid);

	/*
	 * If we're in a block that belongs to a future range, summarize what
	 * we've got and start afresh.  Note the scan might have skipped many
	 * pages, if they were devoid of live tuples; make sure to insert index
	 * tuples for those too.
	 */
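	/*
	 * For example, with bs_pagesPerRange = 2 and bs_currRangeStart = 4, a
	 * tuple on block 9 makes the loop below run twice, inserting summaries
	 * for the ranges starting at blocks 4 and 6, before the tuple itself is
	 * accumulated into the new current range starting at block 8.
	 */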
	while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
	{

		BRIN_elog((DEBUG2,
				   "brinbuildCallback: completed a range: %u--%u",
				   state->bs_currRangeStart,
				   state->bs_currRangeStart + state->bs_pagesPerRange));

		/* create the index tuple and insert it */
		form_and_insert_tuple(state);

		/* set state to correspond to the next range */
		state->bs_currRangeStart += state->bs_pagesPerRange;

		/* re-initialize state for it */
		brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
	}

	/* Accumulate the current tuple into the running state */
	(void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
							   values, isnull);
}

/*
 * Per-heap-tuple callback for table_index_build_scan with parallelism.
 *
 * A version of the callback used by parallel index builds.  The main
 * difference is that instead of writing the BRIN tuples into the index, we
 * write them into a shared tuplesort, and leave the insertion up to the
 * leader (which may reorder them a bit etc.).  The callback also does not
 * generate empty ranges, those will be added by the leader when merging
 * results from workers.
 */
static void
brinbuildCallbackParallel(Relation index,
						  ItemPointer tid,
						  Datum *values,
						  bool *isnull,
						  bool tupleIsAlive,
						  void *brstate)
{
	BrinBuildState *state = (BrinBuildState *) brstate;
	BlockNumber thisblock;

	thisblock = ItemPointerGetBlockNumber(tid);

	/*
	 * If we're in a block that belongs to a different range, summarize what
	 * we've got and start afresh.  Note the scan might have skipped many
	 * pages, if they were devoid of live tuples; we do not create empty BRIN
	 * ranges here - the leader is responsible for filling them in.
	 *
	 * Unlike serial builds, parallel index builds allow synchronized
	 * seqscans (because that's what parallel scans do).  This means the
	 * block may wrap around to the beginning of the relation, so the
	 * condition needs to check for both future and past ranges.
	 */
	if ((thisblock < state->bs_currRangeStart) ||
		(thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
	{

		BRIN_elog((DEBUG2,
				   "brinbuildCallbackParallel: completed a range: %u--%u",
				   state->bs_currRangeStart,
				   state->bs_currRangeStart + state->bs_pagesPerRange));

		/* create the index tuple and write it into the tuplesort */
		form_and_spill_tuple(state);

		/*
		 * Set state to correspond to the next range (for this block).
		 *
		 * This skips ranges that are either empty (and so we don't get any
		 * tuples to summarize), or processed by other workers.  We can't
		 * differentiate those cases here easily, so we leave it up to the
		 * leader to fill empty ranges where needed.
		 */
		state->bs_currRangeStart
			= state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);

		/* re-initialize state for it */
		brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
	}

	/* Accumulate the current tuple into the running state */
	(void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
							   values, isnull);
}

/*
 * brinbuild() -- build a new BRIN index.
 */
IndexBuildResult *
brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	double		idxtuples;
	BrinRevmap *revmap;
	BrinBuildState *state;
	Buffer		meta;
	BlockNumber pagesPerRange;

	/*
	 * We expect to be called exactly once for any index relation.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/*
	 * Critical section not required, because on error the creation of the
	 * whole relation will be rolled back.
	 */

	meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
							 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
	Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);

	brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
					   BRIN_CURRENT_VERSION);
	MarkBufferDirty(meta);

	if (RelationNeedsWAL(index))
	{
		xl_brin_createidx xlrec;
		XLogRecPtr	recptr;
		Page		page;

		xlrec.version = BRIN_CURRENT_VERSION;
		xlrec.pagesPerRange = BrinGetPagesPerRange(index);

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
		XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);

		recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);

		page = BufferGetPage(meta);
		PageSetLSN(page, recptr);
	}

	UnlockReleaseBuffer(meta);

	/*
	 * Initialize our state, including the deformed tuple state.
	 */
	revmap = brinRevmapInitialize(index, &pagesPerRange);
	state = initialize_brin_buildstate(index, revmap, pagesPerRange,
									   RelationGetNumberOfBlocks(heap));

	/*
	 * Attempt to launch parallel worker scan when required
	 *
	 * XXX plan_create_index_workers makes the number of workers dependent on
	 * maintenance_work_mem, requiring 32MB for each worker.  That makes
	 * sense for btree, but not for BRIN, which can do away with much less
	 * memory.  So maybe make that somehow less strict, optionally?
	 */
	if (indexInfo->ii_ParallelWorkers > 0)
		_brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
							 indexInfo->ii_ParallelWorkers);

	/*
	 * If parallel build requested and at least one worker process was
	 * successfully launched, set up coordination state, wait for workers to
	 * complete.  Then read all tuples from the shared tuplesort and insert
	 * them into the index.
	 *
	 * In serial mode, simply scan the table and build the index one index
	 * tuple at a time.
	 */
	if (state->bs_leader)
	{
		SortCoordinate coordinate;

		coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
		coordinate->isWorker = false;
		coordinate->nParticipants =
			state->bs_leader->nparticipanttuplesorts;
		coordinate->sharedsort = state->bs_leader->sharedsort;

		/*
		 * Begin leader tuplesort.
		 *
		 * In cases where parallelism is involved, the leader receives the
		 * same share of maintenance_work_mem as a serial sort (it is
		 * generally treated in the same way as a serial sort once we
		 * return).  Parallel worker Tuplesortstates will have received only
		 * a fraction of maintenance_work_mem, though.
		 *
		 * We rely on the lifetime of the Leader Tuplesortstate almost not
		 * overlapping with any worker Tuplesortstate's lifetime.  There may
		 * be some small overlap, but that's okay because we rely on leader
		 * Tuplesortstate only allocating a small, fixed amount of memory
		 * here.  When its tuplesort_performsort() is called (by our caller),
		 * and significant amounts of memory are likely to be used, all
		 * workers must have already freed almost all memory held by their
		 * Tuplesortstates (they are about to go away completely, too).  The
		 * overall effect is that maintenance_work_mem always represents an
		 * absolute high watermark on the amount of memory used by a CREATE
		 * INDEX operation, regardless of the use of parallelism or any other
		 * factor.
		 */
		state->bs_sortstate =
			tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
									   TUPLESORT_NONE);

		_brin_end_parallel(state->bs_leader, state);
	}
	else						/* no parallel index build */
	{
		/*
		 * Now scan the relation.  No syncscan allowed here because we want
		 * the heap blocks in physical order (we want to produce the ranges
		 * starting from block 0, and the callback also relies on this to
		 * not generate a summary for the same range twice).
		 */
		reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
										   brinbuildCallback, (void *) state, NULL);

		/*
		 * process the final batch
		 *
		 * XXX Note this does not update state->bs_currRangeStart, i.e. it
		 * stays set to the last range added to the index.  This is OK,
		 * because that's what brin_fill_empty_ranges expects.
		 */
		form_and_insert_tuple(state);

		/*
		 * Backfill the final ranges with empty data.
		 *
		 * This saves us from doing what amounts to full table scans when
		 * scanning the index with a predicate like WHERE (nonnull_column IS
		 * NULL), or other very selective predicates.
		 */
		brin_fill_empty_ranges(state,
							   state->bs_currRangeStart,
							   state->bs_maxRangeStart);

		/* track the number of relation tuples */
		state->bs_reltuples = reltuples;
	}

	/* release resources */
	idxtuples = state->bs_numtuples;
	reltuples = state->bs_reltuples;
	brinRevmapTerminate(state->bs_rmAccess);
	terminate_brin_buildstate(state);

	/*
	 * Return statistics
	 */
	result = palloc_object(IndexBuildResult);

	result->heap_tuples = reltuples;
	result->index_tuples = idxtuples;

	return result;
}

void
brinbuildempty(Relation index)
{
	Buffer		metabuf;

	/* An empty BRIN index has a metapage only. */
	metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
								EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

	/* Initialize and xlog metabuffer. */
	START_CRIT_SECTION();
	brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
					   BRIN_CURRENT_VERSION);
	MarkBufferDirty(metabuf);
	log_newpage_buffer(metabuf, true);
	END_CRIT_SECTION();

	UnlockReleaseBuffer(metabuf);
}

/*
 * brinbulkdelete
 *		Since there are no per-heap-tuple index tuples in BRIN indexes,
 *		there's not a lot we can do here.
 *
 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
 * tuple is deleted), meaning the need to re-run summarization on the
 * affected range.  Would need to add an extra flag in brintuples for that.
 */
IndexBulkDeleteResult *
brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = palloc0_object(IndexBulkDeleteResult);

	return stats;
}

/*
 * This routine is in charge of "vacuuming" a BRIN index: we just summarize
 * ranges that are currently unsummarized.
 */
IndexBulkDeleteResult *
brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	Relation	heapRel;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	if (!stats)
		stats = palloc0_object(IndexBulkDeleteResult);
	stats->num_pages = RelationGetNumberOfBlocks(info->index);
	/* rest of stats is initialized by zeroing */

	heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
						 AccessShareLock);

	brin_vacuum_scan(info->index, info->strategy);

	brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
				  &stats->num_index_tuples, &stats->num_index_tuples);

	table_close(heapRel, AccessShareLock);

	return stats;
}

/*
 * reloptions processor for BRIN indexes
 */
bytea *
brinoptions(Datum reloptions, bool validate)
{
	static const relopt_parse_elt tab[] = {
		{"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
		{"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
	};

	return (bytea *) build_reloptions(reloptions, validate,
									  RELOPT_KIND_BRIN,
									  sizeof(BrinOptions),
									  tab, lengthof(tab));
}
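
/*
 * Both reloptions can be set when the index is created, e.g. (with
 * placeholder index, table and column names):
 *
 *		CREATE INDEX brin_idx ON tbl USING brin (col)
 *			WITH (pages_per_range = 64, autosummarize = on);
 */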

/*
 * SQL-callable function to scan through an index and summarize all ranges
 * that are not currently summarized.
 */
Datum
brin_summarize_new_values(PG_FUNCTION_ARGS)
{
	Datum		relation = PG_GETARG_DATUM(0);

	return DirectFunctionCall2(brin_summarize_range,
							   relation,
							   Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
}
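
/*
 * From SQL this is invoked as, e.g.:
 *
 *		SELECT brin_summarize_new_values('brin_idx'::regclass);
 *
 * which, per the DirectFunctionCall2 above, is equivalent to calling
 * brin_summarize_range() with BRIN_ALL_BLOCKRANGES as the block number.
 */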

/*
 * SQL-callable function to summarize the indicated page range, if not
 * already summarized.  If the second argument is BRIN_ALL_BLOCKRANGES, all
 * unsummarized ranges are summarized.
 */
Datum
brin_summarize_range(PG_FUNCTION_ARGS)
{
	Oid			indexoid = PG_GETARG_OID(0);
	int64		heapBlk64 = PG_GETARG_INT64(1);
	BlockNumber heapBlk;
	Oid			heapoid;
	Relation	indexRel;
	Relation	heapRel;
	Oid			save_userid;
	int			save_sec_context;
	int			save_nestlevel;
	double		numSummarized = 0;

	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("BRIN control functions cannot be executed during recovery.")));

	if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("block number out of range: %lld",
						(long long) heapBlk64)));
	heapBlk = (BlockNumber) heapBlk64;
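
	/*
	 * The accepted range is therefore 0 .. 2^32 - 1, where the maximum
	 * value, BRIN_ALL_BLOCKRANGES (InvalidBlockNumber, 0xFFFFFFFF), acts as
	 * the sentinel requesting summarization of all unsummarized ranges.
	 */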

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indexoid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 */
	heapoid = IndexGetRelation(indexoid, true);
	if (OidIsValid(heapoid))
	{
		heapRel = table_open(heapoid, ShareUpdateExclusiveLock);

		/*
		 * Autovacuum calls us.  For its benefit, switch to the table owner's
		 * userid, so that any index functions are run as that user.  Also
		 * lock down security-restricted operations and arrange to make GUC
		 * variable changes local to this command.  This is harmless, albeit
		 * unnecessary, when called from SQL, because we fail shortly if the
		 * user does not own the index.
		 */
		GetUserIdAndSecContext(&save_userid, &save_sec_context);
		SetUserIdAndSecContext(heapRel->rd_rel->relowner,
							   save_sec_context | SECURITY_RESTRICTED_OPERATION);
		save_nestlevel = NewGUCNestLevel();
		RestrictSearchPath();
	}
	else
	{
		heapRel = NULL;
		/* Set these just to suppress "uninitialized variable" warnings */
		save_userid = InvalidOid;
		save_sec_context = -1;
		save_nestlevel = -1;
	}

	indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

	/* Must be a BRIN index */
	if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
		indexRel->rd_rel->relam != BRIN_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a BRIN index",
						RelationGetRelationName(indexRel))));

	/* User must own the index (comparable to privileges needed for VACUUM) */
	if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
					   RelationGetRelationName(indexRel));

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could
	 * have netted us the wrong table.  Recheck.
	 */
	if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index \"%s\"",
						RelationGetRelationName(indexRel))));

	/* see gin_clean_pending_list() */
	if (indexRel->rd_index->indisvalid)
		brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
	else
		ereport(DEBUG1,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("index \"%s\" is not valid",
						RelationGetRelationName(indexRel))));

	/* Roll back any GUC changes executed by index functions */
	AtEOXact_GUC(false, save_nestlevel);

	/* Restore userid and security context */
	SetUserIdAndSecContext(save_userid, save_sec_context);

	relation_close(indexRel, ShareUpdateExclusiveLock);
	relation_close(heapRel, ShareUpdateExclusiveLock);

	PG_RETURN_INT32((int32) numSummarized);
}
1472 : :
1473 : : /*
1474 : : * SQL-callable interface to mark a range as no longer summarized
1475 : : */
1476 : : Datum
2570 1477 : 52 : brin_desummarize_range(PG_FUNCTION_ARGS)
1478 : : {
2524 bruce@momjian.us 1479 : 52 : Oid indexoid = PG_GETARG_OID(0);
1480 : 52 : int64 heapBlk64 = PG_GETARG_INT64(1);
1481 : : BlockNumber heapBlk;
1482 : : Oid heapoid;
1483 : : Relation heapRel;
1484 : : Relation indexRel;
1485 : : bool done;
1486 : :
2131 alvherre@alvh.no-ip. 1487 [ - + ]: 52 : if (RecoveryInProgress())
2131 alvherre@alvh.no-ip. 1488 [ # # ]:UBC 0 : ereport(ERROR,
1489 : : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1490 : : errmsg("recovery is in progress"),
1491 : : errhint("BRIN control functions cannot be executed during recovery.")));
1492 : :
2570 alvherre@alvh.no-ip. 1493 [ + - + + ]:CBC 52 : if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1494 [ + - ]: 9 : ereport(ERROR,
1495 : : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1496 : : errmsg("block number out of range: %lld",
1497 : : (long long) heapBlk64)));
1498 : 43 : heapBlk = (BlockNumber) heapBlk64;
1499 : :
1500 : : /*
1501 : : * We must lock table before index to avoid deadlocks. However, if the
1502 : : * passed indexoid isn't an index then IndexGetRelation() will fail.
1503 : : * Rather than emitting a not-very-helpful error message, postpone
1504 : : * complaining, expecting that the is-it-an-index test below will fail.
1505 : : *
1506 : : * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1507 : : * don't switch userid.
1508 : : */
1509 : 43 : heapoid = IndexGetRelation(indexoid, true);
1510 [ + - ]: 43 : if (OidIsValid(heapoid))
1910 andres@anarazel.de 1511 : 43 : heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1512 : : else
2570 alvherre@alvh.no-ip. 1513 :UBC 0 : heapRel = NULL;
1514 : :
2570 alvherre@alvh.no-ip. 1515 :CBC 43 : indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1516 : :
1517 : : /* Must be a BRIN index */
1518 [ + - ]: 43 : if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1519 [ - + ]: 43 : indexRel->rd_rel->relam != BRIN_AM_OID)
2570 alvherre@alvh.no-ip. 1520 [ # # ]:UBC 0 : ereport(ERROR,
1521 : : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1522 : : errmsg("\"%s\" is not a BRIN index",
1523 : : RelationGetRelationName(indexRel))));
1524 : :
1525 : : /* User must own the index (comparable to privileges needed for VACUUM) */
518 peter@eisentraut.org 1526 [ - + ]:CBC 43 : if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
2325 peter_e@gmx.net 1527 :UBC 0 : aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
2570 alvherre@alvh.no-ip. 1528 : 0 : RelationGetRelationName(indexRel));
1529 : :
1530 : : /*
1531 : : * Since we did the IndexGetRelation call above without any lock, it's
1532 : : * barely possible that a race against an index drop/recreation could have
1533 : : * netted us the wrong table. Recheck.
1534 : : */
2570 alvherre@alvh.no-ip. 1535 [ + - - + ]:CBC 43 : if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
2570 alvherre@alvh.no-ip. 1536 [ # # ]:UBC 0 : ereport(ERROR,
1537 : : (errcode(ERRCODE_UNDEFINED_TABLE),
1538 : : errmsg("could not open parent table of index \"%s\"",
1539 : : RelationGetRelationName(indexRel))));
1540 : :
1541 : : /* see gin_clean_pending_list() */
167 noah@leadboat.com 1542 [ + - ]:CBC 43 : if (indexRel->rd_index->indisvalid)
1543 : : {
1544 : : /* the revmap does the hard work */
1545 : : do
1546 : : {
1547 : 43 : done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1548 : : }
1549 [ - + ]: 43 : while (!done);
1550 : : }
1551 : : else
167 noah@leadboat.com 1552 [ # # ]:UBC 0 : ereport(DEBUG1,
1553 : : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1554 : : errmsg("index \"%s\" is not valid",
1555 : : RelationGetRelationName(indexRel))));
1556 : :
2570 alvherre@alvh.no-ip. 1557 :CBC 43 : relation_close(indexRel, ShareUpdateExclusiveLock);
1558 : 43 : relation_close(heapRel, ShareUpdateExclusiveLock);
1559 : :
1560 : 43 : PG_RETURN_VOID();
1561 : : }
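/*
 * [Editorial usage note.] From SQL, e.g.:
 *
 *     SELECT brin_desummarize_range('brinidx'::regclass, 0);
 *
 * (index name hypothetical) forces the range containing heap block 0 back
 * to the unsummarized state, so that a later brin_summarize_range() call
 * or autosummarization can rebuild its summary from scratch.
 */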
1562 : :
1563 : : /*
1564 : : * Build a BrinDesc used to create or scan a BRIN index
1565 : : */
1566 : : BrinDesc *
3446 1567 : 2254 : brin_build_desc(Relation rel)
1568 : : {
1569 : : BrinOpcInfo **opcinfo;
1570 : : BrinDesc *bdesc;
1571 : : TupleDesc tupdesc;
1572 : 2254 : int totalstored = 0;
1573 : : int keyno;
1574 : : long totalsize;
1575 : : MemoryContext cxt;
1576 : : MemoryContext oldcxt;
1577 : :
1578 : 2254 : cxt = AllocSetContextCreate(CurrentMemoryContext,
1579 : : "brin desc cxt",
1580 : : ALLOCSET_SMALL_SIZES);
1581 : 2254 : oldcxt = MemoryContextSwitchTo(cxt);
1582 : 2254 : tupdesc = RelationGetDescr(rel);
1583 : :
1584 : : /*
1585 : : * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1586 : : * the number of columns stored, since the number is opclass-defined.
1587 : : */
331 tgl@sss.pgh.pa.us 1588 : 2254 : opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
3446 alvherre@alvh.no-ip. 1589 [ + + ]: 38049 : for (keyno = 0; keyno < tupdesc->natts; keyno++)
1590 : : {
1591 : : FmgrInfo *opcInfoFn;
2429 andres@anarazel.de 1592 : 35795 : Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1593 : :
3446 alvherre@alvh.no-ip. 1594 : 35795 : opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1595 : :
1596 : 71590 : opcinfo[keyno] = (BrinOpcInfo *)
2429 andres@anarazel.de 1597 : 35795 : DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
3446 alvherre@alvh.no-ip. 1598 : 35795 : totalstored += opcinfo[keyno]->oi_nstored;
1599 : : }
1600 : :
1601 : : /* Allocate our result struct and fill it in */
1602 : 2254 : totalsize = offsetof(BrinDesc, bd_info) +
1603 : 2254 : sizeof(BrinOpcInfo *) * tupdesc->natts;
1604 : :
1605 : 2254 : bdesc = palloc(totalsize);
1606 : 2254 : bdesc->bd_context = cxt;
1607 : 2254 : bdesc->bd_index = rel;
1608 : 2254 : bdesc->bd_tupdesc = tupdesc;
1609 : 2254 : bdesc->bd_disktdesc = NULL; /* generated lazily */
1610 : 2254 : bdesc->bd_totalstored = totalstored;
1611 : :
1612 [ + + ]: 38049 : for (keyno = 0; keyno < tupdesc->natts; keyno++)
1613 : 35795 : bdesc->bd_info[keyno] = opcinfo[keyno];
1614 : 2254 : pfree(opcinfo);
1615 : :
1616 : 2254 : MemoryContextSwitchTo(oldcxt);
1617 : :
1618 : 2254 : return bdesc;
1619 : : }
1620 : :
1621 : : void
1622 : 1708 : brin_free_desc(BrinDesc *bdesc)
1623 : : {
1624 : : /* make sure the tupdesc is still valid */
1625 [ - + ]: 1708 : Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1626 : : /* no need for retail pfree */
1627 : 1708 : MemoryContextDelete(bdesc->bd_context);
1628 : 1708 : }
1629 : :
1630 : : /*
1631 : : * Fetch index's statistical data into *stats
1632 : : */
1633 : : void
2565 1634 : 5365 : brinGetStats(Relation index, BrinStatsData *stats)
1635 : : {
1636 : : Buffer metabuffer;
1637 : : Page metapage;
1638 : : BrinMetaPageData *metadata;
1639 : :
1640 : 5365 : metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1641 : 5365 : LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1642 : 5365 : metapage = BufferGetPage(metabuffer);
1643 : 5365 : metadata = (BrinMetaPageData *) PageGetContents(metapage);
1644 : :
1645 : 5365 : stats->pagesPerRange = metadata->pagesPerRange;
1646 : 5365 : stats->revmapNumPages = metadata->lastRevmapPage - 1;
1647 : :
1648 : 5365 : UnlockReleaseBuffer(metabuffer);
1649 : 5365 : }
1650 : :
1651 : : /*
1652 : : * Initialize a BrinBuildState appropriate to create tuples on the given index.
1653 : : */
1654 : : static BrinBuildState *
3446 1655 : 214 : initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1656 : : BlockNumber pagesPerRange, BlockNumber tablePages)
1657 : : {
1658 : : BrinBuildState *state;
128 tomas.vondra@postgre 1659 :GNC 214 : BlockNumber lastRange = 0;
1660 : :
580 peter@eisentraut.org 1661 :CBC 214 : state = palloc_object(BrinBuildState);
1662 : :
3446 alvherre@alvh.no-ip. 1663 : 214 : state->bs_irel = idxRel;
1664 : 214 : state->bs_numtuples = 0;
128 tomas.vondra@postgre 1665 :GNC 214 : state->bs_reltuples = 0;
3446 alvherre@alvh.no-ip. 1666 :CBC 214 : state->bs_currentInsertBuf = InvalidBuffer;
1667 : 214 : state->bs_pagesPerRange = pagesPerRange;
1668 : 214 : state->bs_currRangeStart = 0;
1669 : 214 : state->bs_rmAccess = revmap;
1670 : 214 : state->bs_bdesc = brin_build_desc(idxRel);
1671 : 214 : state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
128 tomas.vondra@postgre 1672 :GNC 214 : state->bs_leader = NULL;
1673 : 214 : state->bs_worker_id = 0;
106 1674 : 214 : state->bs_sortstate = NULL;
1678 : :
1679 : : /* Remember the memory context to use for an empty tuple, if needed. */
1680 : 214 : state->bs_context = CurrentMemoryContext;
1681 : 214 : state->bs_emptyTuple = NULL;
1682 : 214 : state->bs_emptyTupleLen = 0;
1683 : :
1684 : : /*
1685 : : * Calculate the start of the last page range. Page numbers are 0-based,
1686 : : * so to calculate the index we need to subtract one. The integer division
1687 : : * gives us the index of the page range.
1688 : : */
1689 [ + + ]: 214 : if (tablePages > 0)
1690 : 166 : lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1691 : :
1692 : : /* Now calculate the start of the next range. */
1693 : 214 : state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
1694 : :
3446 alvherre@alvh.no-ip. 1695 :CBC 214 : return state;
1696 : : }
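/*
 * [Editorial worked example for the range arithmetic above.] With
 * tablePages = 1000 and pagesPerRange = 128:
 *
 *     lastRange        = ((1000 - 1) / 128) * 128 = 7 * 128 = 896
 *     bs_maxRangeStart = 896 + 128                = 1024
 *
 * so the last (possibly partial) range starts at heap block 896, and 1024
 * is the start of the first range past the end of the table.
 */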
1697 : :
1698 : : /*
1699 : : * Release resources associated with a BrinBuildState.
1700 : : */
1701 : : static void
1702 : 211 : terminate_brin_buildstate(BrinBuildState *state)
1703 : : {
1704 : : /*
1705 : : * Release the last index buffer used. We might as well ensure that
1706 : : * whatever free space remains in that page is available in FSM, too.
1707 : : */
1708 [ + + ]: 211 : if (!BufferIsInvalid(state->bs_currentInsertBuf))
1709 : : {
1710 : : Page page;
1711 : : Size freespace;
1712 : : BlockNumber blk;
1713 : :
2916 kgrittn@postgresql.o 1714 : 166 : page = BufferGetPage(state->bs_currentInsertBuf);
2202 tgl@sss.pgh.pa.us 1715 : 166 : freespace = PageGetFreeSpace(page);
1716 : 166 : blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
3446 alvherre@alvh.no-ip. 1717 : 166 : ReleaseBuffer(state->bs_currentInsertBuf);
1804 akapila@postgresql.o 1718 : 166 : RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
2202 tgl@sss.pgh.pa.us 1719 : 166 : FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1720 : : }
1721 : :
3446 alvherre@alvh.no-ip. 1722 : 211 : brin_free_desc(state->bs_bdesc);
1723 : 211 : pfree(state->bs_dtuple);
1724 : 211 : pfree(state);
1725 : 211 : }
1726 : :
1727 : : /*
1728 : : * On the given BRIN index, summarize the heap page range that corresponds
1729 : : * to the heap block number given.
1730 : : *
1731 : : * This routine can run in parallel with insertions into the heap. To avoid
1732 : : * missing those values from the summary tuple, we first insert a placeholder
1733 : : * index tuple into the index, then execute the heap scan; transactions
1734 : : * concurrent with the scan update the placeholder tuple. After the scan, we
1735 : : * union the placeholder tuple with the one computed by this routine. The
1736 : : * update of the index value happens in a loop, so that if somebody updates
1737 : : * the placeholder tuple after we read it, we detect the case and try again.
1738 : : * This ensures that the concurrently inserted tuples are not lost.
1739 : : *
1740 : : * A further corner case is this routine being asked to summarize the partial
1741 : : * range at the end of the table. heapNumBlocks is the (possibly outdated)
1742 : : * table size; if we notice that the requested range lies beyond that size,
1743 : : * we re-compute the table size after inserting the placeholder tuple, to
1744 : : * avoid missing pages that were appended recently.
1745 : : */
1746 : : static void
1747 : 1473 : summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1748 : : BlockNumber heapBlk, BlockNumber heapNumBlks)
1749 : : {
1750 : : Buffer phbuf;
1751 : : BrinTuple *phtup;
1752 : : Size phsz;
1753 : : OffsetNumber offset;
1754 : : BlockNumber scanNumBlks;
1755 : :
1756 : : /*
1757 : : * Insert the placeholder tuple
1758 : : */
1759 : 1473 : phbuf = InvalidBuffer;
1760 : 1473 : phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1761 : 1473 : offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1762 : : state->bs_rmAccess, &phbuf,
1763 : : heapBlk, phtup, phsz);
1764 : :
1765 : : /*
1766 : : * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1767 : : * cannot shrink concurrently (but it can grow).
1768 : : */
2354 1769 [ - + ]: 1473 : Assert(heapBlk % state->bs_pagesPerRange == 0);
1770 [ + + ]: 1473 : if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1771 : : {
1772 : : /*
1773 : : * If we're asked to scan what we believe to be the final range on the
1774 : : * table (i.e. a range that might be partial) we need to recompute our
1775 : : * idea of what the latest page is after inserting the placeholder
1776 : : * tuple. Anyone that grows the table later will update the
1777 : : * placeholder tuple, so it doesn't matter that we won't scan these
1778 : : * pages ourselves. Careful: the table might have been extended
1779 : : * beyond the current range, so clamp our result.
1780 : : *
1781 : : * Fortunately, this should occur infrequently.
1782 : : */
1783 [ + - ]: 12 : scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1784 : : state->bs_pagesPerRange);
1785 : : }
1786 : : else
1787 : : {
1788 : : /* Easy case: range is known to be complete */
1789 : 1461 : scanNumBlks = state->bs_pagesPerRange;
1790 : : }
1791 : :
1792 : : /*
1793 : : * Execute the partial heap scan covering the heap blocks in the specified
1794 : : * page range, summarizing the heap tuples in it. This scan stops just
1795 : : * short of brinbuildCallback creating the new index entry.
1796 : : *
1797 : : * Note that it is critical we use the "any visible" mode of
1798 : : * table_index_build_range_scan here: otherwise, we would miss tuples
1799 : : * inserted by transactions that are still in progress, among other corner
1800 : : * cases.
1801 : : */
3446 1802 : 1473 : state->bs_currRangeStart = heapBlk;
1839 1803 : 1473 : table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1804 : : heapBlk, scanNumBlks,
1805 : : brinbuildCallback, (void *) state, NULL);
1806 : :
1807 : : /*
1808 : : * Now we update the values obtained by the scan with the placeholder
1809 : : * tuple. We do this in a loop which only terminates if we're able to
1810 : : * update the placeholder tuple successfully; if we are not, this means
1811 : : * somebody else modified the placeholder tuple after we read it.
1812 : : */
1813 : : for (;;)
3446 alvherre@alvh.no-ip. 1814 :UBC 0 : {
1815 : : BrinTuple *newtup;
1816 : : Size newsize;
1817 : : bool didupdate;
1818 : : bool samepage;
1819 : :
3446 alvherre@alvh.no-ip. 1820 [ - + ]:CBC 1473 : CHECK_FOR_INTERRUPTS();
1821 : :
1822 : : /*
1823 : : * Form the updated summary tuple and try to install it over the placeholder.
1824 : : */
1825 : 1473 : newtup = brin_form_tuple(state->bs_bdesc,
1826 : : heapBlk, state->bs_dtuple, &newsize);
1827 : 1473 : samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1828 : : didupdate =
1829 : 1473 : brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1830 : : state->bs_rmAccess, heapBlk, phbuf, offset,
1831 : : phtup, phsz, newtup, newsize, samepage);
1832 : 1473 : brin_free_tuple(phtup);
1833 : 1473 : brin_free_tuple(newtup);
1834 : :
1835 : : /* If the update succeeded, we're done. */
1836 [ + - ]: 1473 : if (didupdate)
1837 : 1473 : break;
1838 : :
1839 : : /*
1840 : : * If the update didn't work, it might be because somebody updated the
1841 : : * placeholder tuple concurrently. Extract the new version, union it
1842 : : * with the values we have from the scan, and start over. (There are
1843 : : * other reasons for the update to fail, but it's simple to treat them
1844 : : * the same.)
1845 : : */
3446 alvherre@alvh.no-ip. 1846 :UBC 0 : phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1847 : : &offset, &phsz, BUFFER_LOCK_SHARE);
1848 : : /* the placeholder tuple must exist */
1849 [ # # ]: 0 : if (phtup == NULL)
1850 [ # # ]: 0 : elog(ERROR, "missing placeholder tuple");
2564 1851 : 0 : phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
3446 1852 : 0 : LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1853 : :
1854 : : /* merge it into the tuple from the heap scan */
1855 : 0 : union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1856 : : }
1857 : :
3446 alvherre@alvh.no-ip. 1858 :CBC 1473 : ReleaseBuffer(phbuf);
1859 : 1473 : }
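/*
 * [Editorial sketch of the concurrency protocol implemented above, using
 * the names from this file; not a separate implementation.]
 *
 *     phtup = brin_form_placeholder_tuple(...);
 *     brin_doinsert(...);                        // placeholder now visible
 *     table_index_build_range_scan(...);         // summarize existing rows
 *     for (;;)
 *     {
 *         newtup = brin_form_tuple(...);         // summary from our scan
 *         if (brin_doupdate(..., phtup, phsz, newtup, newsize, samepage))
 *             break;                             // nobody raced us; done
 *         phtup = brinGetTupleForHeapBlock(...); // re-read raced placeholder
 *         union_tuples(bdesc, bs_dtuple, phtup); // fold in concurrent inserts
 *     }
 */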
1860 : :
1861 : : /*
1862 : : * Summarize page ranges that are not already summarized. If pageRange is
1863 : : * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1864 : : * page range containing the given heap page number is scanned.
1865 : : * If include_partial is true, then the partial range at the end of the table
1866 : : * is summarized, otherwise not.
1867 : : *
1868 : : * For each new index tuple inserted, *numSummarized (if not NULL) is
1869 : : * incremented; for each existing tuple, *numExisting (if not NULL) is
1870 : : * incremented.
1871 : : */
1872 : : static void
2570 1873 : 133 : brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
1874 : : bool include_partial, double *numSummarized, double *numExisting)
1875 : : {
1876 : : BrinRevmap *revmap;
3446 1877 : 133 : BrinBuildState *state = NULL;
1878 : 133 : IndexInfo *indexInfo = NULL;
1879 : : BlockNumber heapNumBlocks;
1880 : : BlockNumber pagesPerRange;
1881 : : Buffer buf;
1882 : : BlockNumber startBlk;
1883 : :
219 tmunro@postgresql.or 1884 :GNC 133 : revmap = brinRevmapInitialize(index, &pagesPerRange);
1885 : :
1886 : : /* determine range of pages to process */
2354 alvherre@alvh.no-ip. 1887 :CBC 133 : heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
2570 1888 [ + + ]: 133 : if (pageRange == BRIN_ALL_BLOCKRANGES)
1889 : 96 : startBlk = 0;
1890 : : else
1891 : : {
1892 : 37 : startBlk = (pageRange / pagesPerRange) * pagesPerRange;
2354 1893 : 37 : heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
1894 : : }
1895 [ - + ]: 133 : if (startBlk > heapNumBlocks)
1896 : : {
1897 : : /* Nothing to do if start point is beyond end of table */
2354 alvherre@alvh.no-ip. 1898 :UBC 0 : brinRevmapTerminate(revmap);
1899 : 0 : return;
1900 : : }
1901 : :
1902 : : /*
1903 : : * Scan the revmap to find unsummarized items.
1904 : : */
3446 alvherre@alvh.no-ip. 1905 :CBC 133 : buf = InvalidBuffer;
2354 1906 [ + + ]: 10171 : for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1907 : : {
1908 : : BrinTuple *tup;
1909 : : OffsetNumber off;
1910 : :
1911 : : /*
1912 : : * Unless requested to summarize even a partial range, go away now if
1913 : : * we think the next range is partial. Callers pass true when they run
1914 : : * once after bulk data loading is done (brin_summarize_new_values),
1915 : : * and false when summarization happens as part of an arbitrarily-
1916 : : * scheduled maintenance command (vacuuming).
1917 : : */
1918 [ + + ]: 10089 : if (!include_partial &&
1919 [ + + ]: 1718 : (startBlk + pagesPerRange > heapNumBlocks))
1920 : 51 : break;
1921 : :
3446 1922 [ - + ]: 10038 : CHECK_FOR_INTERRUPTS();
1923 : :
2354 1924 : 10038 : tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
1925 : : BUFFER_LOCK_SHARE);
3446 1926 [ + + ]: 10038 : if (tup == NULL)
1927 : : {
1928 : : /* no revmap entry for this heap range. Summarize it. */
1929 [ + + ]: 1473 : if (state == NULL)
1930 : : {
1931 : : /* first time through */
1932 [ - + ]: 45 : Assert(!indexInfo);
1933 : 45 : state = initialize_brin_buildstate(index, revmap,
1934 : : pagesPerRange,
1935 : : InvalidBlockNumber);
1936 : 45 : indexInfo = BuildIndexInfo(index);
1937 : : }
2354 1938 : 1473 : summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1939 : :
1940 : : /* and re-initialize state for the next range */
3446 1941 : 1473 : brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1942 : :
1943 [ + - ]: 1473 : if (numSummarized)
1944 : 1473 : *numSummarized += 1.0;
1945 : : }
1946 : : else
1947 : : {
1948 [ + + ]: 8565 : if (numExisting)
1949 : 1615 : *numExisting += 1.0;
1950 : 8565 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1951 : : }
1952 : : }
1953 : :
1954 [ + + ]: 133 : if (BufferIsValid(buf))
1955 : 100 : ReleaseBuffer(buf);
1956 : :
1957 : : /* free resources */
1958 : 133 : brinRevmapTerminate(revmap);
1959 [ + + ]: 133 : if (state)
1960 : : {
1961 : 45 : terminate_brin_buildstate(state);
3175 1962 : 45 : pfree(indexInfo);
1963 : : }
1964 : : }
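/*
 * [Editorial worked example of the range selection above.] For a
 * single-range request with pageRange = 1000 and pagesPerRange = 128:
 *
 *     startBlk      = (1000 / 128) * 128 = 896
 *     heapNumBlocks = Min(heapNumBlocks, 896 + 128)
 *
 * so the loop visits only the range [896, 1024); with
 * pageRange = BRIN_ALL_BLOCKRANGES it instead starts at block 0 and walks
 * every range in the table.
 */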
1965 : :
1966 : : /*
1967 : : * Given a deformed tuple in the build state, convert it into the on-disk
1968 : : * format and insert it into the index, making the revmap point to it.
1969 : : */
1970 : : static void
3446 1971 : 1299 : form_and_insert_tuple(BrinBuildState *state)
1972 : : {
1973 : : BrinTuple *tup;
1974 : : Size size;
1975 : :
1976 : 1299 : tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1977 : : state->bs_dtuple, &size);
1978 : 1299 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1979 : : &state->bs_currentInsertBuf, state->bs_currRangeStart,
1980 : : tup, size);
1981 : 1299 : state->bs_numtuples++;
1982 : :
1983 : 1299 : pfree(tup);
1984 : 1299 : }
1985 : :
1986 : : /*
1987 : : * Given a deformed tuple in the build state, convert it into the on-disk
1988 : : * format and write it to a (shared) tuplesort (the leader will insert it
1989 : : * into the index later).
1990 : : */
1991 : : static void
128 tomas.vondra@postgre 1992 :GNC 27 : form_and_spill_tuple(BrinBuildState *state)
1993 : : {
1994 : : BrinTuple *tup;
1995 : : Size size;
1996 : :
1997 : : /* don't insert empty tuples in parallel build */
1998 [ + + ]: 27 : if (state->bs_dtuple->bt_empty_range)
1999 : 3 : return;
2000 : :
2001 : 24 : tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2002 : : state->bs_dtuple, &size);
2003 : :
2004 : : /* write the BRIN tuple to the tuplesort */
106 2005 : 24 : tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2006 : :
128 2007 : 24 : state->bs_numtuples++;
2008 : :
2009 : 24 : pfree(tup);
2010 : : }
2011 : :
2012 : : /*
2013 : : * Given two deformed tuples, adjust the first one so that it's consistent
2014 : : * with the summary values in both.
2015 : : */
2016 : : static void
3446 alvherre@alvh.no-ip. 2017 :GBC 8 : union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
2018 : : {
2019 : : int keyno;
2020 : : BrinMemTuple *db;
2021 : : MemoryContext cxt;
2022 : : MemoryContext oldcxt;
2023 : :
2024 : : /* Use our own memory context to avoid retail pfree */
2025 : 8 : cxt = AllocSetContextCreate(CurrentMemoryContext,
2026 : : "brin union",
2027 : : ALLOCSET_DEFAULT_SIZES);
2028 : 8 : oldcxt = MemoryContextSwitchTo(cxt);
2564 2029 : 8 : db = brin_deform_tuple(bdesc, b, NULL);
3446 2030 : 8 : MemoryContextSwitchTo(oldcxt);
2031 : :
2032 : : /*
2033 : : * Check if the ranges are empty.
2034 : : *
2035 : : * If at least one of them is empty, we don't need to call per-key union
2036 : : * functions at all. If "b" is empty, we just use "a" as the result (it
2037 : : * might be empty too, but that's fine). If "a" is empty but "b" is not,
2038 : : * we use "b" as the result (but we have to copy the data into "a" first).
2039 : : *
2040 : : * Only when both ranges are non-empty do we actually do the per-key merge.
2041 : : */
2042 : :
2043 : : /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
331 tomas.vondra@postgre 2044 [ - + ]: 8 : if (db->bt_empty_range)
2045 : : {
2046 : : /* skip the per-key merge */
331 tomas.vondra@postgre 2047 :UBC 0 : MemoryContextDelete(cxt);
2048 : 0 : return;
2049 : : }
2050 : :
2051 : : /*
2052 : : * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2053 : : * But we need to copy the data from "b" to "a" first, because that's how
2054 : : * we pass the result out.
2055 : : *
2056 : : * We have to copy all the global/per-key flags etc. too.
2057 : : */
331 tomas.vondra@postgre 2058 [ - + ]:GBC 8 : if (a->bt_empty_range)
2059 : : {
331 tomas.vondra@postgre 2060 [ # # ]:UBC 0 : for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2061 : : {
2062 : : int i;
2063 : 0 : BrinValues *col_a = &a->bt_columns[keyno];
2064 : 0 : BrinValues *col_b = &db->bt_columns[keyno];
2065 : 0 : BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2066 : :
2067 : 0 : col_a->bv_allnulls = col_b->bv_allnulls;
2068 : 0 : col_a->bv_hasnulls = col_b->bv_hasnulls;
2069 : :
2070 : : /* If "b" has no data, we're done. */
2071 [ # # ]: 0 : if (col_b->bv_allnulls)
2072 : 0 : continue;
2073 : :
2074 [ # # ]: 0 : for (i = 0; i < opcinfo->oi_nstored; i++)
2075 : 0 : col_a->bv_values[i] =
2076 : 0 : datumCopy(col_b->bv_values[i],
2077 : 0 : opcinfo->oi_typcache[i]->typbyval,
2078 : 0 : opcinfo->oi_typcache[i]->typlen);
2079 : : }
2080 : :
2081 : : /* "a" started empty, but "b" was not empty, so remember that */
2082 : 0 : a->bt_empty_range = false;
2083 : :
2084 : : /* skip the per-key merge */
2085 : 0 : MemoryContextDelete(cxt);
2086 : 0 : return;
2087 : : }
2088 : :
2089 : : /* Now we know neither range is empty. */
3446 alvherre@alvh.no-ip. 2090 [ + + ]:GBC 40 : for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2091 : : {
2092 : : FmgrInfo *unionFn;
2093 : 32 : BrinValues *col_a = &a->bt_columns[keyno];
2094 : 32 : BrinValues *col_b = &db->bt_columns[keyno];
1118 tomas.vondra@postgre 2095 : 32 : BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2096 : :
2097 [ + - ]: 32 : if (opcinfo->oi_regular_nulls)
2098 : : {
2099 : : /* Does the "b" summary represent any NULL values? */
332 2100 [ + + - + ]: 32 : bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2101 : :
2102 : : /* Adjust "hasnulls". */
2103 [ + + + + ]: 32 : if (!col_a->bv_allnulls && b_has_nulls)
1118 2104 : 23 : col_a->bv_hasnulls = true;
2105 : :
2106 : : /* If there are no values in B, there's nothing left to do. */
2107 [ - + ]: 32 : if (col_b->bv_allnulls)
1118 tomas.vondra@postgre 2108 :UBC 0 : continue;
2109 : :
2110 : : /*
2111 : : * Adjust "allnulls". If A doesn't have values, just copy the
2112 : : * values from B into A, and we're done. We cannot run the
2113 : : * operators in this case, because values in A might contain
2114 : : * garbage. Note we already established that B contains values.
2115 : : *
2116 : : * Also adjust "hasnulls" in order not to forget the summary
2117 : : * represents NULL values. This is not redundant with the earlier
2118 : : * update, because that only happens when allnulls=false.
2119 : : */
1118 tomas.vondra@postgre 2120 [ + + ]:GBC 32 : if (col_a->bv_allnulls)
2121 : 2 : {
2122 : : int i;
2123 : :
2124 : 2 : col_a->bv_allnulls = false;
332 2125 : 2 : col_a->bv_hasnulls = true;
2126 : :
1118 2127 [ + + ]: 5 : for (i = 0; i < opcinfo->oi_nstored; i++)
2128 : 3 : col_a->bv_values[i] =
2129 : 3 : datumCopy(col_b->bv_values[i],
2130 : 3 : opcinfo->oi_typcache[i]->typbyval,
2131 : 3 : opcinfo->oi_typcache[i]->typlen);
2132 : :
2133 : 2 : continue;
2134 : : }
2135 : : }
2136 : :
3446 alvherre@alvh.no-ip. 2137 : 30 : unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2138 : : BRIN_PROCNUM_UNION);
2139 : 30 : FunctionCall3Coll(unionFn,
2140 : 30 : bdesc->bd_index->rd_indcollation[keyno],
2141 : : PointerGetDatum(bdesc),
2142 : : PointerGetDatum(col_a),
2143 : : PointerGetDatum(col_b));
2144 : : }
2145 : :
2146 : 8 : MemoryContextDelete(cxt);
2147 : : }
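/*
 * [Editorial example; union semantics are opclass-defined, so this assumes
 * a minmax opclass, which keeps the smallest minimum and largest maximum.]
 * If "a" summarizes values in [10, 20] with hasnulls = false and "b"
 * summarizes [5, 15] with hasnulls = true, then after union_tuples() "a"
 * summarizes [5, 20] with hasnulls = true.
 */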
2148 : :
2149 : : /*
2150 : : * brin_vacuum_scan
2151 : : * Do a complete scan of the index during VACUUM.
2152 : : *
2153 : : * This routine scans the complete index looking for uncataloged index pages,
2154 : : * i.e. those that might have been lost due to a crash after index extension
2155 : : * and such.
2156 : : */
2157 : : static void
3168 alvherre@alvh.no-ip. 2158 :CBC 67 : brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
2159 : : {
2160 : : BlockNumber nblocks;
2161 : : BlockNumber blkno;
2162 : :
2163 : : /*
2164 : : * Scan the index in physical order, and clean up any possible mess in
2165 : : * each page.
2166 : : */
2202 tgl@sss.pgh.pa.us 2167 : 67 : nblocks = RelationGetNumberOfBlocks(idxrel);
2168 [ + + ]: 358 : for (blkno = 0; blkno < nblocks; blkno++)
2169 : : {
2170 : : Buffer buf;
2171 : :
3168 alvherre@alvh.no-ip. 2172 [ - + ]: 291 : CHECK_FOR_INTERRUPTS();
2173 : :
2174 : 291 : buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
2175 : : RBM_NORMAL, strategy);
2176 : :
2202 tgl@sss.pgh.pa.us 2177 : 291 : brin_page_cleanup(idxrel, buf);
2178 : :
3168 alvherre@alvh.no-ip. 2179 : 291 : ReleaseBuffer(buf);
2180 : : }
2181 : :
2182 : : /*
2183 : : * Update all upper pages in the index's FSM, as well. This ensures not
2184 : : * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2185 : : * but also that any pre-existing damage or out-of-dateness is repaired.
2186 : : */
2202 tgl@sss.pgh.pa.us 2187 : 67 : FreeSpaceMapVacuum(idxrel);
3168 alvherre@alvh.no-ip. 2188 : 67 : }
2189 : :
2190 : : static bool
1118 tomas.vondra@postgre 2191 : 393651 : add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
2192 : : const Datum *values, const bool *nulls)
2193 : : {
2194 : : int keyno;
2195 : :
2196 : : /* If the range starts empty, we're certainly going to modify it. */
331 2197 : 393651 : bool modified = dtup->bt_empty_range;
2198 : :
2199 : : /*
2200 : : * Compare the key values of the new tuple to the stored index values; our
2201 : : * deformed tuple will get updated if the new tuple doesn't fit the
2202 : : * original range (note this means we can't break out of the loop early).
2203 : : * Make a note of whether this happens, so that we know to insert the
2204 : : * modified tuple later.
2205 : : */
1118 2206 [ + + ]: 932175 : for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2207 : : {
2208 : : Datum result;
2209 : : BrinValues *bval;
2210 : : FmgrInfo *addValue;
2211 : : bool has_nulls;
2212 : :
2213 : 538524 : bval = &dtup->bt_columns[keyno];
2214 : :
2215 : : /*
2216 : : * Does the range have actual NULL values? Either of the flags can be
2217 : : * set, but we ignore the state before adding the first row.
2218 : : *
2219 : : * We have to remember this, because we'll modify the flags and we
2220 : : * need to know if the range started as empty.
2221 : : */
331 2222 [ + + ]: 1058660 : has_nulls = ((!dtup->bt_empty_range) &&
2223 [ + + + + ]: 520136 : (bval->bv_hasnulls || bval->bv_allnulls));
2224 : :
2225 : : /*
2226 : : * If the value we're adding is NULL, handle it locally. Otherwise
2227 : : * call the BRIN_PROCNUM_ADDVALUE procedure.
2228 : : */
1118 2229 [ + - + + ]: 538524 : if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2230 : : {
2231 : : /*
2232 : : * If the new value is null, we record that we saw it if it's the
2233 : : * first one; otherwise, there's nothing to do.
2234 : : */
2235 [ + + ]: 13757 : if (!bval->bv_hasnulls)
2236 : : {
2237 : 1947 : bval->bv_hasnulls = true;
2238 : 1947 : modified = true;
2239 : : }
2240 : :
2241 : 13757 : continue;
2242 : : }
2243 : :
2244 : 524767 : addValue = index_getprocinfo(idxRel, keyno + 1,
2245 : : BRIN_PROCNUM_ADDVALUE);
2246 : 524767 : result = FunctionCall4Coll(addValue,
2247 : 524767 : idxRel->rd_indcollation[keyno],
2248 : : PointerGetDatum(bdesc),
2249 : : PointerGetDatum(bval),
2250 : 524767 : values[keyno],
2251 : 524767 : nulls[keyno]);
2252 : : /* if that returned true, we need to insert the updated tuple */
2253 : 524767 : modified |= DatumGetBool(result);
2254 : :
2255 : : /*
2256 : : * If the range had actual NULL values (i.e. did not start empty),
2257 : : * make sure we don't forget about the NULL values. Either the
2258 : : * allnulls flag is still set to true, or (if the opclass cleared it)
2259 : : * we need to set hasnulls=true.
2260 : : *
2261 : : * XXX This can only happen when the opclass modified the tuple, so
2262 : : * the modified flag should be set.
2263 : : */
331 2264 [ + + + + : 524767 : if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
+ - ]
2265 : : {
2266 [ - + ]: 2 : Assert(modified);
2267 : 2 : bval->bv_hasnulls = true;
2268 : : }
2269 : : }
2270 : :
2271 : : /*
2272 : : * After updating summaries for all the keys, mark it as not empty.
2273 : : *
2274 : : * If we're actually changing the flag value (i.e. tuple started as
2275 : : * empty), we should have modified the tuple. So we should not see empty
2276 : : * range that was not modified.
2277 : : */
2278 [ + + - + ]: 393651 : Assert(!dtup->bt_empty_range || modified);
2279 : 393651 : dtup->bt_empty_range = false;
2280 : :
1118 2281 : 393651 : return modified;
2282 : : }
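/*
 * [Editorial example, again assuming a minmax opclass.] Adding the value 42
 * to a range currently summarized as [10, 20] makes the
 * BRIN_PROCNUM_ADDVALUE call widen the summary to [10, 42] and return true,
 * so the function reports the tuple as modified and the caller writes it
 * back; adding 15 leaves [10, 20] untouched and reports no modification
 * (unless the range started empty, which always counts as a change).
 */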
2283 : :
2284 : : static bool
2285 : 94968 : check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2286 : : {
2287 : : int keyno;
2288 : :
2289 : : /*
2290 : : * First check if there are any IS [NOT] NULL scan keys, and if we're
2291 : : * violating them.
2292 : : */
2293 [ + + ]: 95586 : for (keyno = 0; keyno < nnullkeys; keyno++)
2294 : : {
2295 : 1116 : ScanKey key = nullkeys[keyno];
2296 : :
2297 [ - + ]: 1116 : Assert(key->sk_attno == bval->bv_attno);
2298 : :
2299 : : /* Handle only IS NULL/IS NOT NULL tests */
2300 [ - + ]: 1116 : if (!(key->sk_flags & SK_ISNULL))
1118 tomas.vondra@postgre 2301 :UBC 0 : continue;
2302 : :
1118 tomas.vondra@postgre 2303 [ + + ]:CBC 1116 : if (key->sk_flags & SK_SEARCHNULL)
2304 : : {
2305 : : /* IS NULL scan key, but range has no NULLs */
2306 [ + + + + ]: 558 : if (!bval->bv_allnulls && !bval->bv_hasnulls)
2307 : 489 : return false;
2308 : : }
2309 [ + - ]: 558 : else if (key->sk_flags & SK_SEARCHNOTNULL)
2310 : : {
2311 : : /*
2312 : : * For IS NOT NULL, we can only skip ranges that are known to have
2313 : : * only nulls.
2314 : : */
2315 [ + + ]: 558 : if (bval->bv_allnulls)
2316 : 9 : return false;
2317 : : }
2318 : : else
2319 : : {
2320 : : /*
2321 : : * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2322 : : * operators are strict and thus return false with NULL value in
2323 : : * the scan key.
2324 : : */
1118 tomas.vondra@postgre 2325 :UBC 0 : return false;
2326 : : }
2327 : : }
2328 : :
1118 tomas.vondra@postgre 2329 :CBC 94470 : return true;
2330 : : }
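/*
 * [Editorial decision table summarizing the cases above.]
 *
 *     scan key            allnulls   hasnulls   range kept?
 *     ----------------    --------   --------   -----------
 *     IS NULL             false      false      no  (no NULLs to find)
 *     IS NULL             true       -          yes
 *     IS NULL             false      true       yes
 *     IS NOT NULL         true       -          no  (only NULLs present)
 *     IS NOT NULL         false      -          yes
 *     NULL, no flags      -          -          no  (strict operators)
 */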
2331 : :
2332 : : static void
128 tomas.vondra@postgre 2333 :GNC 2 : _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2334 : : bool isconcurrent, int request)
2335 : : {
2336 : : ParallelContext *pcxt;
2337 : : int scantuplesortstates;
2338 : : Snapshot snapshot;
2339 : : Size estbrinshared;
2340 : : Size estsort;
2341 : : BrinShared *brinshared;
2342 : : Sharedsort *sharedsort;
2343 : 2 : BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
2344 : : WalUsage *walusage;
2345 : : BufferUsage *bufferusage;
2346 : 2 : bool leaderparticipates = true;
2347 : : int querylen;
2348 : :
2349 : : #ifdef DISABLE_LEADER_PARTICIPATION
2350 : : leaderparticipates = false;
2351 : : #endif
2352 : :
2353 : : /*
2354 : : * Enter parallel mode, and create context for parallel build of brin
2355 : : * index
2356 : : */
2357 : 2 : EnterParallelMode();
2358 [ - + ]: 2 : Assert(request > 0);
2359 : 2 : pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2360 : : request);
2361 : :
2362 [ + - ]: 2 : scantuplesortstates = leaderparticipates ? request + 1 : request;
2363 : :
2364 : : /*
2365 : : * Prepare for scan of the base relation. In a normal index build, we use
2366 : : * SnapshotAny because we must retrieve all tuples and do our own time
2367 : : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2368 : : * concurrent build, we take a regular MVCC snapshot and index whatever's
2369 : : * live according to that.
2370 : : */
2371 [ + - ]: 2 : if (!isconcurrent)
2372 : 2 : snapshot = SnapshotAny;
2373 : : else
128 tomas.vondra@postgre 2374 :UNC 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
2375 : :
2376 : : /*
2377 : : * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2378 : : */
128 tomas.vondra@postgre 2379 :GNC 2 : estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2380 : 2 : shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2381 : 2 : estsort = tuplesort_estimate_shared(scantuplesortstates);
2382 : 2 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2383 : :
2384 : 2 : shm_toc_estimate_keys(&pcxt->estimator, 2);
2385 : :
2386 : : /*
2387 : : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2388 : : * and PARALLEL_KEY_BUFFER_USAGE.
2389 : : *
2390 : : * If there are no extensions loaded that care, we could skip this. We
2391 : : * have no way of knowing whether anyone's looking at pgWalUsage or
2392 : : * pgBufferUsage, so do it unconditionally.
2393 : : */
2394 : 2 : shm_toc_estimate_chunk(&pcxt->estimator,
2395 : : mul_size(sizeof(WalUsage), pcxt->nworkers));
2396 : 2 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2397 : 2 : shm_toc_estimate_chunk(&pcxt->estimator,
2398 : : mul_size(sizeof(BufferUsage), pcxt->nworkers));
2399 : 2 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2400 : :
2401 : : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2402 [ + - ]: 2 : if (debug_query_string)
2403 : : {
2404 : 2 : querylen = strlen(debug_query_string);
2405 : 2 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2406 : 2 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2407 : : }
2408 : : else
128 tomas.vondra@postgre 2409 :UNC 0 : querylen = 0; /* keep compiler quiet */
2410 : :
2411 : : /* Everyone's had a chance to ask for space, so now create the DSM */
128 tomas.vondra@postgre 2412 :GNC 2 : InitializeParallelDSM(pcxt);
2413 : :
2414 : : /* If no DSM segment was available, back out (do serial build) */
2415 [ - + ]: 2 : if (pcxt->seg == NULL)
2416 : : {
128 tomas.vondra@postgre 2417 [ # # # # ]:UNC 0 : if (IsMVCCSnapshot(snapshot))
2418 : 0 : UnregisterSnapshot(snapshot);
2419 : 0 : DestroyParallelContext(pcxt);
2420 : 0 : ExitParallelMode();
2421 : 0 : return;
2422 : : }
2423 : :
2424 : : /* Store shared build state, for which we reserved space */
128 tomas.vondra@postgre 2425 :GNC 2 : brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2426 : : /* Initialize immutable state */
2427 : 2 : brinshared->heaprelid = RelationGetRelid(heap);
2428 : 2 : brinshared->indexrelid = RelationGetRelid(index);
2429 : 2 : brinshared->isconcurrent = isconcurrent;
2430 : 2 : brinshared->scantuplesortstates = scantuplesortstates;
2431 : 2 : brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2432 : 2 : ConditionVariableInit(&brinshared->workersdonecv);
2433 : 2 : SpinLockInit(&brinshared->mutex);
2434 : :
2435 : : /* Initialize mutable state */
2436 : 2 : brinshared->nparticipantsdone = 0;
2437 : 2 : brinshared->reltuples = 0.0;
2438 : 2 : brinshared->indtuples = 0.0;
2439 : :
2440 : 2 : table_parallelscan_initialize(heap,
2441 : : ParallelTableScanFromBrinShared(brinshared),
2442 : : snapshot);
2443 : :
2444 : : /*
2445 : : * Store shared tuplesort-private state, for which we reserved space.
2446 : : * Then, initialize opaque state using tuplesort routine.
2447 : : */
2448 : 2 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2449 : 2 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2450 : : pcxt->seg);
2451 : :
2452 : : /*
2453 : : * Insert both shared states into the DSM segment's table of contents,
2454 : : * so that worker processes can look them up by key.
2455 : : */
2456 : 2 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2457 : 2 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2458 : :
2459 : : /* Store query string for workers */
2460 [ + - ]: 2 : if (debug_query_string)
2461 : : {
2462 : : char *sharedquery;
2463 : :
2464 : 2 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2465 : 2 : memcpy(sharedquery, debug_query_string, querylen + 1);
2466 : 2 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2467 : : }
2468 : :
2469 : : /*
2470 : : * Allocate space for each worker's WalUsage and BufferUsage; no need to
2471 : : * initialize.
2472 : : */
2473 : 2 : walusage = shm_toc_allocate(pcxt->toc,
2474 : 2 : mul_size(sizeof(WalUsage), pcxt->nworkers));
2475 : 2 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2476 : 2 : bufferusage = shm_toc_allocate(pcxt->toc,
2477 : 2 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
2478 : 2 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2479 : :
2480 : : /* Launch workers, saving status for leader/caller */
2481 : 2 : LaunchParallelWorkers(pcxt);
2482 : 2 : brinleader->pcxt = pcxt;
2483 : 2 : brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2484 [ + - ]: 2 : if (leaderparticipates)
2485 : 2 : brinleader->nparticipanttuplesorts++;
2486 : 2 : brinleader->brinshared = brinshared;
2487 : 2 : brinleader->sharedsort = sharedsort;
2488 : 2 : brinleader->snapshot = snapshot;
2489 : 2 : brinleader->walusage = walusage;
2490 : 2 : brinleader->bufferusage = bufferusage;
2491 : :
2492 : : /* If no workers were successfully launched, back out (do serial build) */
2493 [ + + ]: 2 : if (pcxt->nworkers_launched == 0)
2494 : : {
2495 : 1 : _brin_end_parallel(brinleader, NULL);
2496 : 1 : return;
2497 : : }
2498 : :
2499 : : /* Save leader state now that it's clear build will be parallel */
2500 : 1 : buildstate->bs_leader = brinleader;
2501 : :
2502 : : /* Join heap scan ourselves */
2503 [ + - ]: 1 : if (leaderparticipates)
2504 : 1 : _brin_leader_participate_as_worker(buildstate, heap, index);
2505 : :
2506 : : /*
2507 : : * Caller needs to wait for all launched workers when we return. Make
2508 : : * sure that the failure-to-start case will not hang forever.
2509 : : */
2510 : 1 : WaitForParallelWorkersToAttach(pcxt);
2511 : : }
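/*
 * [Editorial recap of the DSM table-of-contents assembled above.]
 *
 *     PARALLEL_KEY_BRIN_SHARED    BrinShared plus parallel scan descriptor
 *     PARALLEL_KEY_TUPLESORT      Sharedsort coordination state
 *     PARALLEL_KEY_QUERY_TEXT     copy of debug_query_string (if set)
 *     PARALLEL_KEY_WAL_USAGE      per-worker WalUsage array
 *     PARALLEL_KEY_BUFFER_USAGE   per-worker BufferUsage array
 *
 * Workers locate each piece with shm_toc_lookup() in
 * _brin_parallel_build_main() below.
 */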
2512 : :
2513 : : /*
2514 : : * Shut down workers, destroy parallel context, and end parallel mode.
2515 : : */
2516 : : static void
2517 : 2 : _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2518 : : {
2519 : : int i;
2520 : : BrinTuple *btup;
2521 : 2 : BrinMemTuple *memtuple = NULL;
2522 : : Size tuplen;
2523 : 2 : BrinShared *brinshared = brinleader->brinshared;
2524 : 2 : BlockNumber prevblkno = InvalidBlockNumber;
2525 : : MemoryContext rangeCxt,
2526 : : oldCxt;
2527 : :
2528 : : /* Shutdown worker processes */
2529 : 2 : WaitForParallelWorkersToFinish(brinleader->pcxt);
2530 : :
2531 : : /*
2532 : : * If we didn't actually launch workers, we still have to make sure to
2533 : : * exit parallel mode.
2534 : : */
2535 [ + + ]: 2 : if (!state)
2536 : 1 : goto cleanup;
2537 : :
2538 : : /* copy the data into leader state (we have to wait for the workers) */
2539 : 1 : state->bs_reltuples = brinshared->reltuples;
2540 : 1 : state->bs_numtuples = brinshared->indtuples;
2541 : :
2542 : : /* do the actual sort in the leader */
106 2543 : 1 : tuplesort_performsort(state->bs_sortstate);
2544 : :
2545 : : /*
2546 : : * Initialize the BrinMemTuple we'll use to union summaries from workers
2547 : : * (in case they happened to produce parts of the same page range).
2548 : : */
128 2549 : 1 : memtuple = brin_new_memtuple(state->bs_bdesc);
2550 : :
2551 : : /*
2552 : : * Create a memory context we'll reset to combine results for a single
2553 : : * page range (received from the workers). We don't expect a huge number
2554 : : * of overlaps under regular circumstances (for large tables the parallel
2555 : : * chunk size is likely larger than the BRIN page range), but it can
2556 : : * happen, and the union functions may allocate freely. So we had better
2557 : : * reset the context once in a while.
2558 : : */
2559 : 1 : rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
2560 : : "brin union",
2561 : : ALLOCSET_DEFAULT_SIZES);
2562 : 1 : oldCxt = MemoryContextSwitchTo(rangeCxt);
2563 : :
2564 : : /*
2565 : : * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2566 : : * That probably gives us an index that is cheaper to scan, thanks to
2567 : : * mostly getting data from the same index page as before.
2568 : : */
106 2569 [ + + ]: 25 : while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2570 : : {
2571 : : /* Ranges should be multiples of pages_per_range for the index. */
128 2572 [ - + ]: 24 : Assert(btup->bt_blkno % brinshared->pagesPerRange == 0);
2573 : :
2574 : : /*
2575 : : * Do we need to union summaries for the same page range?
2576 : : *
2577 : : * If this is the first brin tuple we read, then just deform it into
2578 : : * the memtuple, and continue with the next one from tuplesort. We may,
2579 : : * however, need to insert empty summaries into the index.
2580 : : *
2581 : : * If it's the same block as the last we saw, we simply union the brin
2582 : : * tuple into it, and we're done - we don't even need to insert empty
2583 : : * ranges, because that was done earlier when we saw the first brin
2584 : : * tuple (for this range).
2585 : : *
2586 : : * Finally, if it's not the first brin tuple, and it's not the same
2587 : : * page range, we need to do the insert and then deform the tuple into
2588 : : * the memtuple. Then we'll insert empty ranges before the new brin
2589 : : * tuple, if needed.
2590 : : */
2591 [ + + ]: 24 : if (prevblkno == InvalidBlockNumber)
2592 : : {
2593 : : /* First brin tuple, just deform into memtuple. */
2594 : 1 : memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2595 : :
2596 : : /* continue to insert empty ranges before this block */
2597 : : }
2598 [ + + ]: 23 : else if (memtuple->bt_blkno == btup->bt_blkno)
2599 : : {
2600 : : /*
2601 : : * Not the first brin tuple, but same page range as the previous
2602 : : * one, so we can merge it into the memtuple.
2603 : : */
2604 : 8 : union_tuples(state->bs_bdesc, memtuple, btup);
2605 : 8 : continue;
2606 : : }
2607 : : else
2608 : : {
2609 : : BrinTuple *tmp;
2610 : : Size len;
2611 : :
2612 : : /*
2613 : : * We got brin tuple for a different page range, so form a brin
2614 : : * tuple from the memtuple, insert it, and re-init the memtuple
2615 : : * from the new brin tuple.
2616 : : */
2617 : 15 : tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2618 : : memtuple, &len);
2619 : :
2620 : 15 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2621 : : &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2622 : :
2623 : : /*
2624 : : * Reset the per-output-range context. This frees all the memory
2625 : : * possibly allocated by the union functions, and also the BRIN
2626 : : * tuple we just formed and inserted.
2627 : : */
2628 : 15 : MemoryContextReset(rangeCxt);
2629 : :
2630 : 15 : memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2631 : :
2632 : : /* continue to insert empty ranges before this block */
2633 : : }
2634 : :
2635 : : /* Fill empty ranges for all ranges missing in the tuplesort. */
2636 : 16 : brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2637 : :
2638 : 16 : prevblkno = btup->bt_blkno;
2639 : : }
2640 : :
106 2641 : 1 : tuplesort_end(state->bs_sortstate);
2642 : :
2643 : : /* Insert the BRIN tuple formed for the last page range, if any. */
128 2644 [ + - ]: 1 : if (prevblkno != InvalidBlockNumber)
2645 : : {
2646 : : BrinTuple *tmp;
2647 : : Size len;
2648 : :
2649 : 1 : tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2650 : : memtuple, &len);
2651 : :
2652 : 1 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2653 : : &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2654 : :
2655 : 1 : pfree(tmp);
2656 : : }
2657 : :
2658 : : /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2659 : 1 : brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2660 : :
2661 : : /*
2662 : : * Switch back to the original memory context, and destroy the one we
2663 : : * created to isolate the union_tuple calls.
2664 : : */
2665 : 1 : MemoryContextSwitchTo(oldCxt);
2666 : 1 : MemoryContextDelete(rangeCxt);
2667 : :
2668 : : /*
2669 : : * Next, accumulate WAL usage. (This must wait for the workers to finish,
2670 : : * or we might get incomplete data.)
2671 : : */
2672 [ + + ]: 4 : for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2673 : 3 : InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2674 : :
2675 : 1 : cleanup:
2676 : :
2677 : : /* Free last reference to MVCC snapshot, if one was used */
2678 [ + - - + ]: 2 : if (IsMVCCSnapshot(brinleader->snapshot))
128 tomas.vondra@postgre 2679 :UNC 0 : UnregisterSnapshot(brinleader->snapshot);
128 tomas.vondra@postgre 2680 :GNC 2 : DestroyParallelContext(brinleader->pcxt);
2681 : 2 : ExitParallelMode();
2682 : 2 : }
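/*
 * [Editorial sketch of the merge loop above.] Tuples come out of the shared
 * tuplesort ordered by bt_blkno, so three cases cover it:
 *
 *     first tuple               -> deform into memtuple
 *     same blkno as previous    -> union_tuples(memtuple, btup); continue
 *     greater blkno             -> insert the tuple formed from memtuple,
 *                                  reset rangeCxt, deform btup into memtuple
 *
 * with brin_fill_empty_ranges() inserting empty summaries for any ranges
 * no participant produced a tuple for.
 */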
2683 : :
2684 : : /*
2685 : : * Returns size of shared memory required to store state for a parallel
2686 : : * brin index build based on the snapshot its parallel scan will use.
2687 : : */
2688 : : static Size
2689 : 2 : _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2690 : : {
2691 : : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2692 : 2 : return add_size(BUFFERALIGN(sizeof(BrinShared)),
2693 : : table_parallelscan_estimate(heap, snapshot));
2694 : : }
2695 : :
2696 : : /*
2697 : : * Within leader, participate as a parallel worker.
2698 : : */
2699 : : static void
2700 : 1 : _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2701 : : {
2702 : 1 : BrinLeader *brinleader = buildstate->bs_leader;
2703 : : int sortmem;
2704 : :
2705 : : /*
2706 : : * Might as well use a reliable figure when doling out
2707 : : * maintenance_work_mem (when the requested number of workers was not
2708 : : * launched, this will be somewhat higher than it is for other workers).
2709 : : */
2710 : 1 : sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
2711 : :
2712 : : /* Perform work common to all participants */
106 2713 : 1 : _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2714 : : brinleader->sharedsort, heap, index, sortmem, true);
128 2715 : 1 : }
2716 : :
2717 : : /*
2718 : : * Perform a worker's portion of a parallel sort.
2719 : : *
2720 : : * This generates a tuplesort for the worker portion of the table.
2721 : : *
2722 : : * sortmem is the amount of working memory to use within each worker,
2723 : : * expressed in KBs.
2724 : : *
2725 : : * When this returns, workers are done, and need only release resources.
2726 : : */
2727 : : static void
106 2728 : 4 : _brin_parallel_scan_and_build(BrinBuildState *state,
2729 : : BrinShared *brinshared, Sharedsort *sharedsort,
2730 : : Relation heap, Relation index,
2731 : : int sortmem, bool progress)
2732 : : {
2733 : : SortCoordinate coordinate;
2734 : : TableScanDesc scan;
2735 : : double reltuples;
2736 : : IndexInfo *indexInfo;
2737 : :
2738 : : /* Initialize local tuplesort coordination state */
128 2739 : 4 : coordinate = palloc0(sizeof(SortCoordinateData));
2740 : 4 : coordinate->isWorker = true;
2741 : 4 : coordinate->nParticipants = -1;
2742 : 4 : coordinate->sharedsort = sharedsort;
2743 : :
2744 : : /* Begin "partial" tuplesort */
106 2745 : 4 : state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2746 : : TUPLESORT_NONE);
2747 : :
2748 : : /* Join parallel scan */
128 2749 : 4 : indexInfo = BuildIndexInfo(index);
2750 : 4 : indexInfo->ii_Concurrent = brinshared->isconcurrent;
2751 : :
2752 : 4 : scan = table_beginscan_parallel(heap,
2753 : : ParallelTableScanFromBrinShared(brinshared));
2754 : :
2755 : 4 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2756 : : brinbuildCallbackParallel, state, scan);
2757 : :
2758 : : /* spill the last, still-open range to the tuplesort */
2759 : 4 : form_and_spill_tuple(state);
2760 : :
2761 : : /* sort the BRIN ranges built by this worker */
106 2762 : 4 : tuplesort_performsort(state->bs_sortstate);
2763 : :
128 2764 : 4 : state->bs_reltuples += reltuples;
2765 : :
2766 : : /*
2767 : : * Done. Record ambuild statistics.
2768 : : */
2769 [ - + ]: 4 : SpinLockAcquire(&brinshared->mutex);
2770 : 4 : brinshared->nparticipantsdone++;
2771 : 4 : brinshared->reltuples += state->bs_reltuples;
2772 : 4 : brinshared->indtuples += state->bs_numtuples;
2773 : 4 : SpinLockRelease(&brinshared->mutex);
2774 : :
2775 : : /* Notify leader */
2776 : 4 : ConditionVariableSignal(&brinshared->workersdonecv);
2777 : :
106 2778 : 4 : tuplesort_end(state->bs_sortstate);
128 2779 : 4 : }
2780 : :
2781 : : /*
2782 : : * Perform work within a launched parallel process.
2783 : : */
2784 : : void
2785 : 3 : _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2786 : : {
2787 : : char *sharedquery;
2788 : : BrinShared *brinshared;
2789 : : Sharedsort *sharedsort;
2790 : : BrinBuildState *buildstate;
2791 : : Relation heapRel;
2792 : : Relation indexRel;
2793 : : LOCKMODE heapLockmode;
2794 : : LOCKMODE indexLockmode;
2795 : : WalUsage *walusage;
2796 : : BufferUsage *bufferusage;
2797 : : int sortmem;
2798 : :
2799 : : /*
2800 : : * The only possible status flag that can be set for the parallel worker is
2801 : : * PROC_IN_SAFE_IC.
2802 : : */
2803 [ - + - - ]: 3 : Assert((MyProc->statusFlags == 0) ||
2804 : : (MyProc->statusFlags == PROC_IN_SAFE_IC));
2805 : :
2806 : : /* Set debug_query_string for individual workers first */
2807 : 3 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2808 : 3 : debug_query_string = sharedquery;
2809 : :
2810 : : /* Report the query string from leader */
2811 : 3 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
2812 : :
2813 : : /* Look up brin shared state */
2814 : 3 : brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2815 : :
2816 : : /* Open relations using lock modes known to be obtained by index.c */
2817 [ + - ]: 3 : if (!brinshared->isconcurrent)
2818 : : {
2819 : 3 : heapLockmode = ShareLock;
2820 : 3 : indexLockmode = AccessExclusiveLock;
2821 : : }
2822 : : else
2823 : : {
128 tomas.vondra@postgre 2824 :UNC 0 : heapLockmode = ShareUpdateExclusiveLock;
2825 : 0 : indexLockmode = RowExclusiveLock;
2826 : : }
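	/*
	 * To summarize the branch above: a plain CREATE INDEX runs with
	 * ShareLock on the heap and AccessExclusiveLock on the index, while a
	 * concurrent build uses ShareUpdateExclusiveLock and RowExclusiveLock,
	 * mirroring the locks the leader already holds via index.c.
	 */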
2827 : :
2828 : : /* Open relations within worker */
128 tomas.vondra@postgre 2829 :GNC 3 : heapRel = table_open(brinshared->heaprelid, heapLockmode);
2830 : 3 : indexRel = index_open(brinshared->indexrelid, indexLockmode);
2831 : :
2832 : 3 : buildstate = initialize_brin_buildstate(indexRel, NULL,
2833 : : brinshared->pagesPerRange,
2834 : : InvalidBlockNumber);
2835 : :
2836 : : /* Look up shared state private to tuplesort.c */
2837 : 3 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2838 : 3 : tuplesort_attach_shared(sharedsort, seg);
2839 : :
2840 : : /* Prepare to track buffer usage during parallel execution */
2841 : 3 : InstrStartParallelQuery();
2842 : :
2843 : : /*
2844 : : * Might as well use a reliable figure when doling out
2845 : : * maintenance_work_mem (when the requested number of workers was not
2846 : : * launched, this will be somewhat higher than it is for other workers).
2847 : : */
2848 : 3 : sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2849 : :
106 2850 : 3 : _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2851 : : heapRel, indexRel, sortmem, false);
2852 : :
2853 : : /* Report WAL/buffer usage during parallel execution */
128 2854 : 3 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2855 : 3 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2856 : 3 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2857 : 3 : &walusage[ParallelWorkerNumber]);
2858 : :
2859 : 3 : index_close(indexRel, indexLockmode);
2860 : 3 : table_close(heapRel, heapLockmode);
2861 : 3 : }
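/*
 * A worked example of the sortmem split in _brin_parallel_build_main above
 * (illustrative numbers only): with maintenance_work_mem = 65536 kB and
 * scantuplesortstates = 4 (e.g. three workers plus a participating leader),
 * each participant's tuplesort is capped at 65536 / 4 = 16384 kB.  Because
 * the divisor is the participant count recorded in the shared state, every
 * worker computes the same share.
 */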
2862 : :
2863 : : /*
2864 : : * brin_build_empty_tuple
2865 : : * Maybe initialize a BRIN tuple representing an empty range.
2866 : : *
2867 : : * Stores in state->bs_emptyTuple a BRIN tuple representing an empty page
2868 : : * range starting at the specified block number. The empty tuple is built
2869 : : * only once, when it's first needed, in the memory context bs_context to
2870 : : * ensure a proper life span, and is reused on following calls. All empty
2871 : : * tuples are exactly the same except for the bt_blkno field, which is set
2872 : : * to the value of the blkno parameter.
2873 : : */
2874 : : static void
2875 : 4 : brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2876 : : {
2877 : : /* First time an empty tuple is requested? If yes, initialize it. */
2878 [ + + ]: 4 : if (state->bs_emptyTuple == NULL)
2879 : : {
2880 : : MemoryContext oldcxt;
2881 : 1 : BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2882 : :
2883 : : /* Allocate the tuple in context for the whole index build. */
2884 : 1 : oldcxt = MemoryContextSwitchTo(state->bs_context);
2885 : :
2886 : 1 : state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2887 : : &state->bs_emptyTupleLen);
2888 : :
2889 : 1 : MemoryContextSwitchTo(oldcxt);
2890 : : }
2891 : : else
2892 : : {
2893 : : /* If we already have an empty tuple, just update the block. */
2894 : 3 : state->bs_emptyTuple->bt_blkno = blkno;
2895 : : }
2896 : 4 : }
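/*
 * The initialize-once idiom above, reduced to its core (an illustrative
 * sketch with hypothetical names, not brin.c code): pay the allocation cost
 * once in a long-lived memory context, then only touch the cheap field on
 * later calls.
 *
 *     if (cache == NULL)
 *     {
 *         MemoryContext oldcxt = MemoryContextSwitchTo(longlived_cxt);
 *
 *         cache = build_object();      (one-time, allocation-heavy)
 *         MemoryContextSwitchTo(oldcxt);
 *     }
 *     cache->blkno = blkno;            (per-call, cheap)
 */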
2897 : :
2898 : : /*
2899 : : * brin_fill_empty_ranges
2900 : : * Add BRIN index tuples representing empty page ranges.
2901 : : *
2902 : : * prevRange/nextRange determine for which page ranges to add empty summaries.
2903 : : * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2904 : : * (prevRange < blkno < nextRange) will be added to the index.
2905 : : *
2906 : : * If prevRange is InvalidBlockNumber, this means there was no previous page
2907 : : * range (i.e. the first empty range to add is for blkno=0).
2908 : : *
2909 : : * The empty tuple is built only once, and then reused for all future calls.
2910 : : */
2911 : : static void
2912 : 182 : brin_fill_empty_ranges(BrinBuildState *state,
2913 : : BlockNumber prevRange, BlockNumber nextRange)
2914 : : {
2915 : : BlockNumber blkno;
2916 : :
2917 : : /*
2918 : : * If we already summarized some ranges, we need to start with the next
2919 : : * one. Otherwise start from the first range of the table.
2920 : : */
2921 [ + + ]: 182 : blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
2922 : :
2923 : : /* Generate empty ranges until we hit the next non-empty range. */
2924 [ + + ]: 186 : while (blkno < nextRange)
2925 : : {
2926 : : /* Did we already build the empty tuple? If not, do it now. */
2927 : 4 : brin_build_empty_tuple(state, blkno);
2928 : :
2929 : 4 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2930 : : &state->bs_currentInsertBuf,
2931 : : blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
2932 : :
2933 : : /* try next page range */
2934 : 4 : blkno += state->bs_pagesPerRange;
2935 : : }
2936 : 182 : }
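/*
 * A worked example of the boundary arithmetic above (illustrative numbers
 * only): with bs_pagesPerRange = 128, prevRange = 256 and nextRange = 768,
 * the loop starts at blkno = 256 + 128 = 384 and inserts empty summaries for
 * the ranges starting at blocks 384, 512 and 640.  It stops before 768, the
 * next non-empty range, which receives a real summary elsewhere.  With
 * prevRange = InvalidBlockNumber the loop starts at block 0 instead.
 */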