Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsort.c
4 : * Build a btree from sorted input by loading leaf pages sequentially.
5 : *
6 : * NOTES
7 : *
8 : * We use tuplesort.c to sort the given index tuples into order.
9 : * Then we scan the index tuples in order and build the btree pages
10 : * for each level. We load source tuples into leaf-level pages.
11 : * Whenever we fill a page at one level, we add a link to it to its
12 : * parent level (starting a new parent level if necessary). When
13 : * done, we write out each final page on each level, adding it to
14 : * its parent level. When we have only one page on a level, it must be
15 : * the root -- it can be attached to the btree metapage and we are done.
16 : *
17 : * It is not wise to pack the pages entirely full, since then *any*
18 : * insertion would cause a split (and not only of the leaf page; the need
19 : * for a split would cascade right up the tree). The steady-state load
20 : * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 : * pages to the user-controllable fill factor (default 90%) while upper pages
22 : * are always packed to 70%. This gives us reasonable density (there aren't
23 : * many upper pages if the keys are reasonable-size) without risking a lot of
24 : * cascading splits during early insertions.
25 : *
26 : * Formerly the index pages being built were kept in shared buffers, but
27 : * that is of no value (since other backends have no interest in them yet)
28 : * and it created locking problems for CHECKPOINT, because the upper-level
29 : * pages were held exclusive-locked for long periods. Now we just build
30 : * the pages in local memory and smgrwrite or smgrextend them as we finish
31 : * them. They will need to be re-read into shared buffers on first use after
32 : * the build finishes.
33 : *
34 : * This code isn't concerned about the FSM at all. The caller is responsible
35 : * for initializing that.
36 : *
37 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
38 : * Portions Copyright (c) 1994, Regents of the University of California
39 : *
40 : * IDENTIFICATION
41 : * src/backend/access/nbtree/nbtsort.c
42 : *
43 : *-------------------------------------------------------------------------
44 : */
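
/*
 * The loading strategy described in the NOTES above can be illustrated with
 * a small, self-contained sketch.  Everything below (ToyLevel, add_key,
 * emit_page, PAGE_CAPACITY, using a page's first key as its separator) is
 * hypothetical and heavily simplified: real pages hold variable-size index
 * tuples, parent levels store truncated pivot tuples rather than raw first
 * keys, and the root is wired up through the metapage.  The shape of the
 * algorithm is the same, though: fill the current page of each level, push
 * a boundary key into the parent whenever a page is written out, and flush
 * the rightmost page of every level at the end.  With 20 sorted keys and a
 * capacity of 4, this sketch emits five leaf pages, two level-1 pages, and
 * a two-key root, mirroring the "only one page on a level means it is the
 * root" rule above.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_CAPACITY 4             /* keys per toy "page" */

typedef struct ToyLevel
{
    int     keys[PAGE_CAPACITY];    /* current (rightmost) page on this level */
    int     nkeys;
    int     level;
    struct ToyLevel *parent;        /* next level up, created on demand */
} ToyLevel;

static void add_key(ToyLevel *s, int key);

static ToyLevel *
new_level(int level)
{
    ToyLevel   *s = calloc(1, sizeof(ToyLevel));

    s->level = level;
    return s;
}

/* "write out" the current page and report its first key to the parent */
static void
emit_page(ToyLevel *s)
{
    printf("level %d: page with first key %d (%d keys)\n",
           s->level, s->keys[0], s->nkeys);
    if (s->parent == NULL)
        s->parent = new_level(s->level + 1);
    add_key(s->parent, s->keys[0]);
    s->nkeys = 0;
}

static void
add_key(ToyLevel *s, int key)
{
    if (s->nkeys == PAGE_CAPACITY)  /* page "full": start a new one */
        emit_page(s);
    s->keys[s->nkeys++] = key;
}

int
main(void)
{
    ToyLevel   *leaf = new_level(0);

    for (int k = 1; k <= 20; k++)   /* sorted input, as if from tuplesort */
        add_key(leaf, k);

    /*
     * Shutdown: the remaining page on each level is the rightmost one.  A
     * level whose rightmost page has no parent is the root; otherwise its
     * first key is pushed up, possibly filling (and growing) the parent.
     */
    for (ToyLevel *s = leaf; s != NULL; s = s->parent)
    {
        if (s->parent == NULL)
            printf("level %d: root page with first key %d (%d keys)\n",
                   s->level, s->keys[0], s->nkeys);
        else
            emit_page(s);
    }
    return 0;
}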
45 :
46 : #include "postgres.h"
47 :
48 : #include "access/nbtree.h"
49 : #include "access/parallel.h"
50 : #include "access/relscan.h"
51 : #include "access/table.h"
52 : #include "access/xact.h"
53 : #include "access/xlog.h"
54 : #include "access/xloginsert.h"
55 : #include "catalog/index.h"
56 : #include "commands/progress.h"
57 : #include "executor/instrument.h"
58 : #include "miscadmin.h"
59 : #include "pgstat.h"
60 : #include "storage/smgr.h"
61 : #include "tcop/tcopprot.h" /* pgrminclude ignore */
62 : #include "utils/rel.h"
63 : #include "utils/sortsupport.h"
64 : #include "utils/tuplesort.h"
65 :
66 :
67 : /* Magic numbers for parallel state sharing */
68 : #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
69 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
70 : #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
71 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
72 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
73 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
74 :
75 : /*
76 : * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
77 : * parallel index builds. This may be useful as a debugging aid.
78 : #undef DISABLE_LEADER_PARTICIPATION
79 : */
80 :
81 : /*
82 : * Status record for spooling/sorting phase. (Note we may have two of
83 : * these due to the special requirements for uniqueness-checking with
84 : * dead tuples.)
85 : */
86 : typedef struct BTSpool
87 : {
88 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
89 : Relation heap;
90 : Relation index;
91 : bool isunique;
92 : bool nulls_not_distinct;
93 : } BTSpool;
94 :
95 : /*
96 : * Status for index builds performed in parallel. This is allocated in a
97 : * dynamic shared memory segment. Note that there is a separate tuplesort TOC
98 : * entry, private to tuplesort.c but allocated by this module on its behalf.
99 : */
100 : typedef struct BTShared
101 : {
102 : /*
103 : * These fields are not modified during the sort. They primarily exist
104 : * for the benefit of worker processes that need to create BTSpool state
105 : * corresponding to that used by the leader.
106 : */
107 : Oid heaprelid;
108 : Oid indexrelid;
109 : bool isunique;
110 : bool nulls_not_distinct;
111 : bool isconcurrent;
112 : int scantuplesortstates;
113 :
114 : /*
115 : * workersdonecv is used to monitor the progress of workers. All parallel
116 : * participants must indicate that they are done before leader can use
117 : * mutable state that workers maintain during scan (and before leader can
118 : * proceed to tuplesort_performsort()).
119 : */
120 : ConditionVariable workersdonecv;
121 :
122 : /*
123 : * mutex protects all following fields (but not the parallel table scan
123 : * data that follows the struct).
124 : *
125 : * These fields contain status information of interest to B-Tree index
126 : * builds that must work just the same when an index is built in parallel.
127 : */
128 : slock_t mutex;
129 :
130 : /*
131 : * Mutable state that is maintained by workers, and reported back to
132 : * leader at end of parallel scan.
133 : *
134 : * nparticipantsdone is number of worker processes finished.
135 : *
136 : * reltuples is the total number of input heap tuples.
137 : *
138 : * havedead indicates if RECENTLY_DEAD tuples were encountered during
139 : * build.
140 : *
141 : * indtuples is the total number of tuples that made it into the index.
142 : *
143 : * brokenhotchain indicates if any worker detected a broken HOT chain
144 : * during build.
145 : */
146 : int nparticipantsdone;
147 : double reltuples;
148 : bool havedead;
149 : double indtuples;
150 : bool brokenhotchain;
151 :
152 : /*
153 : * ParallelTableScanDescData data follows. Can't directly embed here, as
154 : * implementations of the parallel table scan desc interface might need
155 : * stronger alignment.
156 : */
157 : } BTShared;
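
/*
 * The workersdonecv / mutex / nparticipantsdone protocol described above is
 * a conventional "wait until every worker has reported" pattern.  A rough
 * standalone analogy using plain pthreads is sketched below; the names
 * (ToyShared, toy_worker, NWORKERS) are invented for illustration, and
 * PostgreSQL itself uses a spinlock (slock_t) plus its own ConditionVariable
 * machinery (ConditionVariableSleep() in the leader) rather than pthread
 * mutexes and condition variables.  The idea is the same: workers update
 * the mutable shared counters under the lock and signal; the leader sleeps
 * until the done-count reaches the number of launched participants.
 */
#include <pthread.h>
#include <stdio.h>

#define NWORKERS 3

typedef struct ToyShared
{
    pthread_mutex_t mutex;          /* protects the two fields below */
    pthread_cond_t  workersdonecv;  /* signalled as each worker finishes */
    int         nparticipantsdone;
    double      reltuples;
} ToyShared;

static ToyShared shared = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0.0
};

static void *
toy_worker(void *arg)
{
    double      scanned = 1000.0;   /* pretend we scanned this many tuples */

    (void) arg;
    /* update mutable shared state under the lock, then wake the leader */
    pthread_mutex_lock(&shared.mutex);
    shared.reltuples += scanned;
    shared.nparticipantsdone++;
    pthread_mutex_unlock(&shared.mutex);
    pthread_cond_signal(&shared.workersdonecv);
    return NULL;
}

int
main(void)
{
    pthread_t   workers[NWORKERS];

    for (int i = 0; i < NWORKERS; i++)
        pthread_create(&workers[i], NULL, toy_worker, NULL);

    /* leader: sleep until every participant has reported in */
    pthread_mutex_lock(&shared.mutex);
    while (shared.nparticipantsdone < NWORKERS)
        pthread_cond_wait(&shared.workersdonecv, &shared.mutex);
    pthread_mutex_unlock(&shared.mutex);

    printf("all %d workers done, reltuples = %.0f\n",
           NWORKERS, shared.reltuples);

    for (int i = 0; i < NWORKERS; i++)
        pthread_join(workers[i], NULL);
    return 0;
}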
158 :
159 : /*
160 : * Return pointer to a BTShared's parallel table scan.
161 : *
162 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
163 : * MAXALIGN.
164 : */
165 : #define ParallelTableScanFromBTShared(shared) \
166 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
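
/*
 * The macro above is the common "fixed struct header followed by more
 * strictly aligned variable-size data" layout.  A generic sketch of the
 * same idea follows; TOY_ALIGN and its 64-byte boundary are made-up
 * illustrative values, not PostgreSQL's BUFFERALIGN, and the sketch assumes
 * the base allocation itself is sufficiently aligned (as a shared memory
 * TOC chunk from shm_toc_allocate would be).
 */
#include <stdio.h>
#include <stdlib.h>

/* round len up to the next multiple of 64 (illustrative value only) */
#define TOY_ALIGN(len)  (((size_t) (len) + 63) & ~(size_t) 63)

typedef struct Header
{
    int         version;
    int         flags;
} Header;

/* the trailing payload starts at the first aligned offset past the header */
static void *
payload_from_header(Header *hdr)
{
    return (char *) hdr + TOY_ALIGN(sizeof(Header));
}

/* allocate header plus payload as one chunk */
static Header *
alloc_with_payload(size_t payload_size)
{
    return (Header *) malloc(TOY_ALIGN(sizeof(Header)) + payload_size);
}

int
main(void)
{
    Header     *hdr = alloc_with_payload(1024);

    printf("payload begins %zu bytes after the header\n",
           (size_t) ((char *) payload_from_header(hdr) - (char *) hdr));
    free(hdr);
    return 0;
}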
167 :
168 : /*
169 : * Status for leader in parallel index build.
170 : */
171 : typedef struct BTLeader
172 : {
173 : /* parallel context itself */
174 : ParallelContext *pcxt;
175 :
176 : /*
177 : * nparticipanttuplesorts is the exact number of worker processes
178 : * successfully launched, plus one leader process if it participates as a
179 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
180 : * participating as a worker).
181 : */
182 : int nparticipanttuplesorts;
183 :
184 : /*
185 : * Leader process convenience pointers to shared state (leader avoids TOC
186 : * lookups).
187 : *
188 : * btshared is the shared state for entire build. sharedsort is the
189 : * shared, tuplesort-managed state passed to each process tuplesort.
190 : * sharedsort2 is the corresponding btspool2 shared state, used only when
191 : * building unique indexes. snapshot is the snapshot used by the scan iff
192 : * an MVCC snapshot is required.
193 : */
194 : BTShared *btshared;
195 : Sharedsort *sharedsort;
196 : Sharedsort *sharedsort2;
197 : Snapshot snapshot;
198 : WalUsage *walusage;
199 : BufferUsage *bufferusage;
200 : } BTLeader;
201 :
202 : /*
203 : * Working state for btbuild and its callback.
204 : *
205 : * When parallel CREATE INDEX is used, there is a BTBuildState for each
206 : * participant.
207 : */
208 : typedef struct BTBuildState
209 : {
210 : bool isunique;
211 : bool nulls_not_distinct;
212 : bool havedead;
213 : Relation heap;
214 : BTSpool *spool;
215 :
216 : /*
217 : * spool2 is needed only when the index is a unique index. Dead tuples are
218 : * put into spool2 instead of spool in order to avoid the uniqueness check.
219 : */
220 : BTSpool *spool2;
221 : double indtuples;
222 :
223 : /*
224 : * btleader is only present when a parallel index build is performed, and
225 : * only in the leader process. (Actually, only the leader has a
226 : * BTBuildState. Workers have their own spool and spool2, though.)
227 : */
228 : BTLeader *btleader;
229 : } BTBuildState;
230 :
231 : /*
232 : * Status record for a btree page being built. We have one of these
233 : * for each active tree level.
234 : */
235 : typedef struct BTPageState
236 : {
237 : Page btps_page; /* workspace for page building */
238 : BlockNumber btps_blkno; /* block # to write this page at */
239 : IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
240 : OffsetNumber btps_lastoff; /* last item offset loaded */
241 : Size btps_lastextra; /* last item's extra posting list space */
242 : uint32 btps_level; /* tree level (0 = leaf) */
243 : Size btps_full; /* "full" if less than this much free space */
244 : struct BTPageState *btps_next; /* link to parent level, if any */
245 : } BTPageState;
246 :
247 : /*
248 : * Overall status record for index writing phase.
249 : */
250 : typedef struct BTWriteState
251 : {
252 : Relation heap;
253 : Relation index;
254 : BTScanInsert inskey; /* generic insertion scankey */
255 : bool btws_use_wal; /* dump pages to WAL? */
256 : BlockNumber btws_pages_alloced; /* # pages allocated */
257 : BlockNumber btws_pages_written; /* # pages written out */
258 : Page btws_zeropage; /* workspace for filling zeroes */
259 : } BTWriteState;
260 :
261 :
262 : static double _bt_spools_heapscan(Relation heap, Relation index,
263 : BTBuildState *buildstate, IndexInfo *indexInfo);
264 : static void _bt_spooldestroy(BTSpool *btspool);
265 : static void _bt_spool(BTSpool *btspool, ItemPointer self,
266 : Datum *values, bool *isnull);
267 : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
268 : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
269 : bool *isnull, bool tupleIsAlive, void *state);
270 : static Page _bt_blnewpage(uint32 level);
271 : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
272 : static void _bt_slideleft(Page rightmostpage);
273 : static void _bt_sortaddtup(Page page, Size itemsize,
274 : IndexTuple itup, OffsetNumber itup_off,
275 : bool newfirstdataitem);
276 : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
277 : IndexTuple itup, Size truncextra);
278 : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
279 : BTPageState *state,
280 : BTDedupState dstate);
281 : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
282 : static void _bt_load(BTWriteState *wstate,
283 : BTSpool *btspool, BTSpool *btspool2);
284 : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
285 : int request);
286 : static void _bt_end_parallel(BTLeader *btleader);
287 : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
288 : static double _bt_parallel_heapscan(BTBuildState *buildstate,
289 : bool *brokenhotchain);
290 : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
291 : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
292 : BTShared *btshared, Sharedsort *sharedsort,
293 : Sharedsort *sharedsort2, int sortmem,
294 : bool progress);
295 :
296 :
297 : /*
298 : * btbuild() -- build a new btree index.
299 : */
300 : IndexBuildResult *
1892 rhaas 301 CBC 64158 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
302 : {
303 : IndexBuildResult *result;
304 : BTBuildState buildstate;
305 : double reltuples;
306 :
307 : #ifdef BTREE_BUILD_STATS
308 : if (log_btree_build_stats)
309 : ResetUsage();
310 : #endif /* BTREE_BUILD_STATS */
311 :
312 64158 : buildstate.isunique = indexInfo->ii_Unique;
430 peter 313 64158 : buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
1892 rhaas 314 64158 : buildstate.havedead = false;
315 64158 : buildstate.heap = heap;
316 64158 : buildstate.spool = NULL;
317 64158 : buildstate.spool2 = NULL;
318 64158 : buildstate.indtuples = 0;
319 64158 : buildstate.btleader = NULL;
320 :
321 : /*
322 : * We expect to be called exactly once for any index relation. If that's
323 : * not the case, big trouble's what we have.
324 : */
325 64158 : if (RelationGetNumberOfBlocks(index) != 0)
1892 rhaas 326 UBC 0 : elog(ERROR, "index \"%s\" already contains data",
327 : RelationGetRelationName(index));
328 :
1892 rhaas 329 CBC 64158 : reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
330 :
331 : /*
332 : * Finish the build by (1) completing the sort of the spool file, (2)
333 : * inserting the sorted tuples into btree pages and (3) building the upper
334 : * levels. Finally, it may also be necessary to end use of parallelism.
335 : */
336 64155 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
337 64113 : _bt_spooldestroy(buildstate.spool);
338 64113 : if (buildstate.spool2)
339 13 : _bt_spooldestroy(buildstate.spool2);
340 64113 : if (buildstate.btleader)
341 71 : _bt_end_parallel(buildstate.btleader);
342 :
343 64113 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
344 :
345 64113 : result->heap_tuples = reltuples;
346 64113 : result->index_tuples = buildstate.indtuples;
347 :
348 : #ifdef BTREE_BUILD_STATS
349 : if (log_btree_build_stats)
350 : {
351 : ShowUsage("BTREE BUILD STATS");
352 : ResetUsage();
353 : }
354 : #endif /* BTREE_BUILD_STATS */
355 :
356 64113 : return result;
357 : }
358 :
359 : /*
360 : * Create and initialize one or two spool structures, and save them in caller's
361 : * buildstate argument. May also fill in fields within indexInfo used by index
362 : * builds.
363 : *
364 : * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
365 : * routine encapsulates all aspects of managing parallelism. Caller need only
366 : * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
367 : *
368 : * Returns the total number of heap tuples scanned.
369 : */
370 : static double
371 64158 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
372 : IndexInfo *indexInfo)
373 : {
7452 bruce 374 64158 : BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1892 rhaas 375 64158 : SortCoordinate coordinate = NULL;
376 64158 : double reltuples = 0;
377 :
378 : /*
379 : * We size the sort area as maintenance_work_mem rather than work_mem to
380 : * speed index creation. This should be OK since a single backend can't
381 : * run multiple index creations in parallel (see also: notes on
382 : * parallelism and maintenance_work_mem below).
383 : */
3722 tgl 384 64158 : btspool->heap = heap;
8575 385 64158 : btspool->index = index;
1892 rhaas 386 64158 : btspool->isunique = indexInfo->ii_Unique;
430 peter 387 64158 : btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
388 :
389 : /* Save as primary spool */
1892 rhaas 390 64158 : buildstate->spool = btspool;
391 :
392 : /* Report table scan phase started */
1468 alvherre 393 64158 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
394 : PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
395 :
396 : /* Attempt to launch parallel worker scan when required */
1892 rhaas 397 64158 : if (indexInfo->ii_ParallelWorkers > 0)
398 71 : _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
399 : indexInfo->ii_ParallelWorkers);
400 :
401 : /*
402 : * If parallel build requested and at least one worker process was
403 : * successfully launched, set up coordination state
404 : */
405 64158 : if (buildstate->btleader)
406 : {
407 71 : coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
408 71 : coordinate->isWorker = false;
409 71 : coordinate->nParticipants =
410 71 : buildstate->btleader->nparticipanttuplesorts;
411 71 : coordinate->sharedsort = buildstate->btleader->sharedsort;
412 : }
413 :
414 : /*
415 : * Begin serial/leader tuplesort.
416 : *
417 : * In cases where parallelism is involved, the leader receives the same
418 : * share of maintenance_work_mem as a serial sort (it is generally treated
419 : * in the same way as a serial sort once we return). Parallel worker
420 : * Tuplesortstates will have received only a fraction of
421 : * maintenance_work_mem, though.
422 : *
423 : * We rely on the lifetime of the Leader Tuplesortstate almost not
424 : * overlapping with any worker Tuplesortstate's lifetime. There may be
425 : * some small overlap, but that's okay because we rely on leader
426 : * Tuplesortstate only allocating a small, fixed amount of memory here.
427 : * When its tuplesort_performsort() is called (by our caller), and
428 : * significant amounts of memory are likely to be used, all workers must
429 : * have already freed almost all memory held by their Tuplesortstates
430 : * (they are about to go away completely, too). The overall effect is
431 : * that maintenance_work_mem always represents an absolute high watermark
432 : * on the amount of memory used by a CREATE INDEX operation, regardless of
433 : * the use of parallelism or any other factor.
434 : */
435 128316 : buildstate->spool->sortstate =
436 64158 : tuplesort_begin_index_btree(heap, index, buildstate->isunique,
430 peter 437 64158 : buildstate->nulls_not_distinct,
438 : maintenance_work_mem, coordinate,
439 : TUPLESORT_NONE);
440 :
441 : /*
442 : * If building a unique index, put dead tuples in a second spool to keep
443 : * them out of the uniqueness check. We expect that the second spool (for
444 : * dead tuples) won't get very full, so we give it only work_mem.
445 : */
1892 rhaas 446 64158 : if (indexInfo->ii_Unique)
447 : {
448 56640 : BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
449 56640 : SortCoordinate coordinate2 = NULL;
450 :
451 : /* Initialize secondary spool */
452 56640 : btspool2->heap = heap;
453 56640 : btspool2->index = index;
454 56640 : btspool2->isunique = false;
455 : /* Save as secondary spool */
456 56640 : buildstate->spool2 = btspool2;
457 :
458 56640 : if (buildstate->btleader)
459 : {
460 : /*
461 : * Set up non-private state that is passed to
462 : * tuplesort_begin_index_btree() about the basic high level
463 : * coordination of a parallel sort.
464 : */
465 32 : coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
466 32 : coordinate2->isWorker = false;
467 32 : coordinate2->nParticipants =
468 32 : buildstate->btleader->nparticipanttuplesorts;
469 32 : coordinate2->sharedsort = buildstate->btleader->sharedsort2;
470 : }
471 :
472 : /*
473 : * We expect that the second one (for dead tuples) won't get very
474 : * full, so we give it only work_mem
475 : */
476 56640 : buildstate->spool2->sortstate =
430 peter 477 56640 : tuplesort_begin_index_btree(heap, index, false, false, work_mem,
478 : coordinate2, TUPLESORT_NONE);
479 : }
480 :
481 : /* Fill spool using either serial or parallel heap scan */
1892 rhaas 482 64158 : if (!buildstate->btleader)
1468 alvherre 483 64087 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
484 : _bt_build_callback, (void *) buildstate,
485 : NULL);
486 : else
1892 rhaas 487 71 : reltuples = _bt_parallel_heapscan(buildstate,
488 : &indexInfo->ii_BrokenHotChain);
489 :
490 : /*
491 : * Set the progress target for the next phase. Reset the block number
492 : * values set by table_index_build_scan
493 : */
494 : {
796 peter 495 64155 : const int progress_index[] = {
496 : PROGRESS_CREATEIDX_TUPLES_TOTAL,
497 : PROGRESS_SCAN_BLOCKS_TOTAL,
498 : PROGRESS_SCAN_BLOCKS_DONE
499 : };
500 64155 : const int64 progress_vals[] = {
1468 alvherre 501 64155 : buildstate->indtuples,
502 : 0, 0
503 : };
504 :
796 peter 505 64155 : pgstat_progress_update_multi_param(3, progress_index, progress_vals);
506 : }
507 :
508 : /* okay, all heap tuples are spooled */
1892 rhaas 509 64155 : if (buildstate->spool2 && !buildstate->havedead)
510 : {
511 : /* spool2 turns out to be unnecessary */
512 56627 : _bt_spooldestroy(buildstate->spool2);
513 56627 : buildstate->spool2 = NULL;
514 : }
515 :
516 64155 : return reltuples;
517 : }
518 :
519 : /*
520 : * clean up a spool structure and its substructures.
521 : */
522 : static void
8575 tgl 523 120753 : _bt_spooldestroy(BTSpool *btspool)
524 : {
525 120753 : tuplesort_end(btspool->sortstate);
6768 neilc 526 120753 : pfree(btspool);
9770 scrappy 527 120753 : }
528 :
529 : /*
530 : * spool an index entry into the sort file.
531 : */
532 : static void
3204 rhaas 533 11769706 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
534 : {
535 11769706 : tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
536 : self, values, isnull);
9770 scrappy 537 11769706 : }
538 :
539 : /*
540 : * given a spool loaded by successive calls to _bt_spool,
541 : * create an entire btree.
542 : */
543 : static void
8277 inoue 544 64155 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
545 : {
546 : BTWriteState wstate;
547 :
548 : #ifdef BTREE_BUILD_STATS
549 : if (log_btree_build_stats)
550 : {
551 : ShowUsage("BTREE BUILD (Spool) STATISTICS");
552 : ResetUsage();
553 : }
554 : #endif /* BTREE_BUILD_STATS */
555 :
556 : /* Execute the sort */
1468 alvherre 557 64155 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
558 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
7351 tgl 559 64155 : tuplesort_performsort(btspool->sortstate);
8277 inoue 560 64113 : if (btspool2)
561 : {
1468 alvherre 562 13 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
563 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
8277 inoue 564 13 : tuplesort_performsort(btspool2->sortstate);
565 : }
566 :
3722 tgl 567 64113 : wstate.heap = btspool->heap;
6885 568 64113 : wstate.index = btspool->index;
8 andres 569 GNC 64113 : wstate.inskey = _bt_mkscankey(wstate.index, btspool->heap, NULL);
570 : /* _bt_mkscankey() won't set allequalimage without metapage */
1138 pg 571 CBC 64113 : wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
1100 noah 572 64113 : wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
573 :
574 : /* reserve the metapage */
6885 tgl 575 64113 : wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
576 64113 : wstate.btws_pages_written = 0;
6797 bruce 577 64113 : wstate.btws_zeropage = NULL; /* until needed */
578 :
1468 alvherre 579 64113 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
580 : PROGRESS_BTREE_PHASE_LEAF_LOAD);
6885 tgl 581 64113 : _bt_load(&wstate, btspool, btspool2);
9770 scrappy 582 64113 : }
583 :
584 : /*
585 : * Per-tuple callback for table_index_build_scan
586 : */
587 : static void
1892 rhaas 588 11769706 : _bt_build_callback(Relation index,
589 : ItemPointer tid,
590 : Datum *values,
591 : bool *isnull,
592 : bool tupleIsAlive,
593 : void *state)
594 : {
595 11769706 : BTBuildState *buildstate = (BTBuildState *) state;
596 :
597 : /*
598 : * insert the index tuple into the appropriate spool file for subsequent
599 : * processing
600 : */
601 11769706 : if (tupleIsAlive || buildstate->spool2 == NULL)
1248 andres 602 11769481 : _bt_spool(buildstate->spool, tid, values, isnull);
603 : else
604 : {
605 : /* dead tuples are put into spool2 */
1892 rhaas 606 225 : buildstate->havedead = true;
1248 andres 607 225 : _bt_spool(buildstate->spool2, tid, values, isnull);
608 : }
609 :
1892 rhaas 610 11769706 : buildstate->indtuples += 1;
611 11769706 : }
612 :
613 : /*
614 : * allocate workspace for a new, clean btree page, not linked to any siblings.
615 : */
616 : static Page
6885 tgl 617 62011 : _bt_blnewpage(uint32 level)
618 : {
619 : Page page;
620 : BTPageOpaque opaque;
621 :
1 tmunro 622 GNC 62011 : page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
623 :
624 : /* Zero the page and set up standard page header info */
6885 tgl 625 CBC 62011 : _bt_pageinit(page, BLCKSZ);
626 :
627 : /* Initialize BT opaque state */
373 michael 628 62011 : opaque = BTPageGetOpaque(page);
9345 bruce 629 62011 : opaque->btpo_prev = opaque->btpo_next = P_NONE;
774 pg 630 62011 : opaque->btpo_level = level;
7352 tgl 631 62011 : opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
6180 632 62011 : opaque->btpo_cycleid = 0;
633 :
634 : /* Make the P_HIKEY line pointer appear allocated */
6885 635 62011 : ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
636 :
637 62011 : return page;
638 : }
639 :
640 : /*
641 : * emit a completed btree page, and release the working storage.
642 : */
643 : static void
644 126124 : _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
645 : {
646 : /* XLOG stuff */
647 126124 : if (wstate->btws_use_wal)
648 : {
649 : /* We use the XLOG_FPI record type for this */
277 rhaas 650 GNC 113213 : log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true);
651 : }
652 :
653 : /*
654 : * If we have to write pages nonsequentially, fill in the space with
655 : * zeroes until we come back and overwrite. This is not logically
656 : * necessary on standard Unix filesystems (unwritten space will read as
657 : * zeroes anyway), but it should help to avoid fragmentation. The dummy
658 : * pages aren't WAL-logged though.
659 : */
6885 tgl 660 CBC 148363 : while (blkno > wstate->btws_pages_written)
661 : {
662 22239 : if (!wstate->btws_zeropage)
1 tmunro 663 GNC 18501 : wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
664 : PG_IO_ALIGN_SIZE,
665 : MCXT_ALLOC_ZERO);
666 : /* don't set checksum for all-zero page */
636 tgl 667 GIC 22239 : smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
5354 heikki.linnakangas 668 22239 : wstate->btws_pages_written++,
41 peter 669 GNC 22239 : wstate->btws_zeropage,
5940 tgl 670 ECB : true);
6885 671 : }
672 :
3670 simon 673 GIC 126124 : PageSetChecksumInplace(page, blkno);
674 :
6885 tgl 675 ECB : /*
676 : * Now write the page. There's no need for smgr to schedule an fsync for
677 : * this write; we'll do it ourselves before ending the build.
678 : */
6885 tgl 679 GIC 126124 : if (blkno == wstate->btws_pages_written)
680 : {
5940 tgl 681 ECB : /* extending the file... */
636 tgl 682 GIC 103885 : smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
683 : page, true);
6885 tgl 684 CBC 103885 : wstate->btws_pages_written++;
685 : }
5940 tgl 686 ECB : else
687 : {
688 : /* overwriting a block we zero-filled before */
636 tgl 689 GIC 22239 : smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
690 : page, true);
5940 tgl 691 ECB : }
692 :
6885 tgl 693 GIC 126124 : pfree(page);
7352 694 126124 : }
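
/*
 * The gap handling in _bt_blwritepage() above (pad any skipped blocks with
 * zero pages so a later nonsequential write lands on already-allocated
 * space) can be sketched in isolation with plain stdio.  toy_write_block,
 * TOY_BLCKSZ and the FILE-based I/O are stand-ins for illustration, not the
 * smgr API; the real code additionally WAL-logs the page and goes through
 * smgrextend()/smgrwrite().  The sketch assumes fp's position is kept at
 * end of file between calls, which it maintains itself.
 */
#include <stdio.h>

#define TOY_BLCKSZ 8192

static long toy_blocks_written = 0;

static void
toy_write_block(FILE *fp, long blkno, const char *page)
{
    static const char zeropage[TOY_BLCKSZ];     /* all zeroes */

    while (blkno > toy_blocks_written)
    {
        fwrite(zeropage, TOY_BLCKSZ, 1, fp);    /* fill the gap with zeroes */
        toy_blocks_written++;
    }

    if (blkno == toy_blocks_written)
    {
        fwrite(page, TOY_BLCKSZ, 1, fp);        /* extending the file */
        toy_blocks_written++;
    }
    else
    {
        /* overwrite a previously zero-filled block, then return to EOF */
        fseek(fp, blkno * TOY_BLCKSZ, SEEK_SET);
        fwrite(page, TOY_BLCKSZ, 1, fp);
        fseek(fp, 0, SEEK_END);
    }
}

int
main(void)
{
    FILE       *fp = tmpfile();
    char        page[TOY_BLCKSZ] = {1};

    toy_write_block(fp, 0, page);   /* extend */
    toy_write_block(fp, 3, page);   /* zero-fill blocks 1-2, then extend */
    toy_write_block(fp, 1, page);   /* overwrite a zero-filled block */
    fclose(fp);
    return 0;
}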
7352 tgl 695 ECB :
8297 696 : /*
697 : * allocate and initialize a new BTPageState. the returned structure
698 : * is suitable for immediate use by _bt_buildadd.
699 : */
700 : static BTPageState *
6885 tgl 701 GIC 23225 : _bt_pagestate(BTWriteState *wstate, uint32 level)
702 : {
7452 bruce 703 CBC 23225 : BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
704 :
6885 tgl 705 ECB : /* create initial page for level */
6885 tgl 706 GIC 23225 : state->btps_page = _bt_blnewpage(level);
707 :
6885 tgl 708 ECB : /* and assign it a page position */
6885 tgl 709 GIC 23225 : state->btps_blkno = wstate->btws_pages_alloced++;
710 :
1249 pg 711 CBC 23225 : state->btps_lowkey = NULL;
712 : /* initialize lastoff so first item goes into P_FIRSTKEY */
8297 tgl 713 23225 : state->btps_lastoff = P_HIKEY;
1138 pg 714 GIC 23225 : state->btps_lastextra = 0;
8297 tgl 715 CBC 23225 : state->btps_level = level;
8297 tgl 716 ECB : /* set "full" threshold based on level. See notes at head of file. */
6124 tgl 717 CBC 23225 : if (level > 0)
6116 tgl 718 GIC 4724 : state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
6124 tgl 719 ECB : else
1231 michael 720 CBC 18501 : state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
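/*
 * For example, with the default 8 kB BLCKSZ this makes nonleaf pages
 * "full" once less than 8192 * 30 / 100 = 2457 bytes remain free, and
 * leaf pages (at the default fillfactor of 90) once less than 819 bytes
 * remain free.
 */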
721 :
8297 tgl 722 ECB : /* no parent level, yet */
7032 neilc 723 GIC 23225 : state->btps_next = NULL;
724 :
8297 tgl 725 CBC 23225 : return state;
726 : }
8297 tgl 727 ECB :
728 : /*
729 : * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
730 : * P_HIKEY, overwriting P_HIKEY).
731 : *
732 : * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
733 : * rightmost page on its level is not supposed to get a high key. Now that
734 : * it's clear that this page is a rightmost page, remove the unneeded empty
735 : * P_HIKEY line pointer space.
736 : */
737 : static void
1007 pg 738 GIC 23225 : _bt_slideleft(Page rightmostpage)
739 : {
9344 bruce 740 ECB : OffsetNumber off;
741 : OffsetNumber maxoff;
742 : ItemId previi;
743 :
1007 pg 744 GIC 23225 : maxoff = PageGetMaxOffsetNumber(rightmostpage);
745 23225 : Assert(maxoff >= P_FIRSTKEY);
1007 pg 746 CBC 23225 : previi = PageGetItemId(rightmostpage, P_HIKEY);
747 1620941 : for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
9345 bruce 748 ECB : {
1007 pg 749 CBC 1597716 : ItemId thisii = PageGetItemId(rightmostpage, off);
750 :
751 1597716 : *previi = *thisii;
1007 pg 752 GIC 1597716 : previi = thisii;
9552 scrappy 753 ECB : }
1007 pg 754 CBC 23225 : ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
9770 scrappy 755 GIC 23225 : }
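
/*
 * The slide performed by _bt_slideleft() above is just an in-place shift of
 * a small array by one slot, dropping the unused first entry.  An equivalent
 * generic sketch (toy int slots, not ItemIds) is shown here for illustration.
 */
#include <stdio.h>
#include <string.h>

/*
 * Drop slot 0 by shifting slots 1..n-1 left one place; the caller then
 * treats the array as having n-1 entries (cf. the pd_lower adjustment above).
 */
static void
toy_slide_left(int *slots, int n)
{
    memmove(&slots[0], &slots[1], (n - 1) * sizeof(int));
}

int
main(void)
{
    int         slots[] = {0, 11, 22, 33};

    toy_slide_left(slots, 4);
    printf("%d %d %d\n", slots[0], slots[1], slots[2]);     /* 11 22 33 */
    return 0;
}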
9770 scrappy 756 ECB :
9552 757 : /*
758 : * Add an item to a page being built.
759 : *
760 : * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
761 : * raises an error directly.
762 : *
763 : * Note that our nbtsort.c caller does not know yet if the page will be
764 : * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
765 : * caller. Page that turns out to be the rightmost on its level is fixed by
766 : * calling _bt_slideleft().
767 : */
768 : static void
8297 tgl 769 GIC 11189149 : _bt_sortaddtup(Page page,
770 : Size itemsize,
6283 tgl 771 ECB : IndexTuple itup,
772 : OffsetNumber itup_off,
773 : bool newfirstdataitem)
774 : {
775 : IndexTupleData trunctuple;
776 :
1091 pg 777 GIC 11189149 : if (newfirstdataitem)
778 : {
6283 tgl 779 CBC 4786 : trunctuple = *itup;
6283 tgl 780 GIC 4786 : trunctuple.t_info = sizeof(IndexTupleData);
1097 pg 781 CBC 4786 : BTreeTupleSetNAtts(&trunctuple, 0, false);
6283 tgl 782 4786 : itup = &trunctuple;
783 4786 : itemsize = sizeof(IndexTupleData);
8297 tgl 784 ECB : }
9552 scrappy 785 :
6283 tgl 786 GIC 11189149 : if (PageAddItem(page, (Item) itup, itemsize, itup_off,
787 : false, false) == InvalidOffsetNumber)
7202 tgl 788 LBC 0 : elog(ERROR, "failed to add item to the index page");
9552 scrappy 789 GIC 11189149 : }
9552 scrappy 790 EUB :
8297 tgl 791 ECB : /*----------
792 : * Add an item to a disk page from the sort output (or add a posting list
793 : * item formed from the sort output).
794 : *
795 : * We must be careful to observe the page layout conventions of nbtsearch.c:
796 : * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
797 : * - on non-leaf pages, the key portion of the first item need not be
798 : * stored, we should store only the link.
799 : *
800 : * A leaf page being built looks like:
801 : *
802 : * +----------------+---------------------------------+
803 : * | PageHeaderData | linp0 linp1 linp2 ... |
804 : * +-----------+----+---------------------------------+
805 : * | ... linpN | |
806 : * +-----------+--------------------------------------+
807 : * | ^ last |
808 : * | |
809 : * +-------------+------------------------------------+
810 : * | | itemN ... |
811 : * +-------------+------------------+-----------------+
812 : * | ... item3 item2 item1 | "special space" |
813 : * +--------------------------------+-----------------+
814 : *
815 : * Contrast this with the diagram in bufpage.h; note the mismatch
816 : * between linps and items. This is because we reserve linp0 as a
817 : * placeholder for the pointer to the "high key" item; when we have
818 : * filled up the page, we will set linp0 to point to itemN and clear
819 : * linpN. On the other hand, if we find this is the last (rightmost)
820 : * page, we leave the items alone and slide the linp array over. If
821 : * the high key is to be truncated, offset 1 is deleted, and we insert
822 : * the truncated high key at offset 1.
823 : *
824 : * 'last' pointer indicates the last offset added to the page.
825 : *
826 : * 'truncextra' is the size of the posting list in itup, if any. This
827 : * information is stashed for the next call here, when we may benefit
828 : * from considering the impact of truncating away the posting list on
829 : * the page before deciding to finish the page off. Posting lists are
830 : * often relatively large, so it is worth going to the trouble of
831 : * accounting for the saving from truncating away the posting list of
832 : * the tuple that becomes the high key (that may be the only way to
833 : * get close to target free space on the page). Note that this is
834 : * only used for the soft fillfactor-wise limit, not the critical hard
835 : * limit.
836 : *----------
837 : */
838 : static void
1138 pg 839 GIC 11150363 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
840 : Size truncextra)
9770 scrappy 841 ECB : {
842 : Page npage;
843 : BlockNumber nblkno;
844 : OffsetNumber last_off;
845 : Size last_truncextra;
846 : Size pgspc;
847 : Size itupsz;
848 : bool isleaf;
849 :
850 : /*
851 : * This is a handy place to check for cancel interrupts during the btree
852 : * load phase of index creation.
853 : */
6239 tgl 854 GIC 11150363 : CHECK_FOR_INTERRUPTS();
855 :
9345 bruce 856 CBC 11150363 : npage = state->btps_page;
6885 tgl 857 GIC 11150363 : nblkno = state->btps_blkno;
9345 bruce 858 CBC 11150363 : last_off = state->btps_lastoff;
1138 pg 859 11150363 : last_truncextra = state->btps_lastextra;
860 11150363 : state->btps_lastextra = truncextra;
9345 bruce 861 ECB :
9345 bruce 862 CBC 11150363 : pgspc = PageGetFreeSpace(npage);
1866 tgl 863 GIC 11150363 : itupsz = IndexTupleSize(itup);
6283 tgl 864 CBC 11150363 : itupsz = MAXALIGN(itupsz);
1438 pg 865 ECB : /* Leaf case has slightly different rules due to suffix truncation */
1438 pg 866 CBC 11150363 : isleaf = (state->btps_level == 0);
867 :
8492 tgl 868 ECB : /*
869 : * Check whether the new item can fit on a btree page on current level at
870 : * all.
871 : *
872 : * Every newly built index will treat heap TID as part of the keyspace,
873 : * which imposes the requirement that new high keys must occasionally have
874 : * a heap TID appended within _bt_truncate(). That may leave a new pivot
875 : * tuple one or two MAXALIGN() quantums larger than the original
876 : * firstright tuple it's derived from. v4 deals with the problem by
877 : * decreasing the limit on the size of tuples inserted on the leaf level
878 : * by the same small amount. Enforce the new v4+ limit on the leaf level,
879 : * and the old limit on internal levels, since pivot tuples may need to
880 : * make use of the reserved space. This should never fail on internal
881 : * pages.
882 : */
1481 pg 883 GIC 11150363 : if (unlikely(itupsz > BTMaxItemSize(npage)))
1438 884 132 : _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
1438 pg 885 ECB : itup);
8492 tgl 886 :
887 : /*
888 : * Check to see if current page will fit new item, with space left over to
889 : * append a heap TID during suffix truncation when page is a leaf page.
890 : *
891 : * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
892 : * high key with heap TID when finishing off a leaf page, since we rely on
893 : * _bt_check_third_page() rejecting oversized non-pivot tuples. On
894 : * internal pages we can always fit 3 pivot tuples with larger internal
895 : * page tuple limit (includes page high key).
896 : *
897 : * Most of the time, a page is only "full" in the sense that the soft
898 : * fillfactor-wise limit has been exceeded. However, we must always leave
899 : * at least two items plus a high key on each page before starting a new
900 : * page. Disregard fillfactor and insert on "full" current page if we
901 : * don't have the minimum number of items yet. (Note that we deliberately
902 : * assume that suffix truncation neither enlarges nor shrinks new high key
903 : * when applying soft limit, except when last tuple has a posting list.)
904 : */
1104 pg 905 GIC 11150363 : Assert(last_truncextra == 0 || isleaf);
1438 906 11150363 : if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
1138 pg 907 CBC 11149799 : (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
9345 bruce 908 ECB : {
8297 tgl 909 : /*
910 : * Finish off the page and write it out.
911 : */
9344 bruce 912 GIC 38786 : Page opage = npage;
6797 913 38786 : BlockNumber oblkno = nblkno;
9344 bruce 914 ECB : ItemId ii;
915 : ItemId hii;
916 : IndexTuple oitup;
917 :
918 : /* Create new page of same level */
6885 tgl 919 GIC 38786 : npage = _bt_blnewpage(state->btps_level);
920 :
6885 tgl 921 ECB : /* and assign it a page position */
6885 tgl 922 GIC 38786 : nblkno = wstate->btws_pages_alloced++;
923 :
9345 bruce 924 ECB : /*
925 : * We copy the last item on the page into the new page, and then
926 : * rearrange the old page so that the 'last item' becomes its high key
927 : * rather than a true data item. There had better be at least two
928 : * items on the page already, else the page would be empty of useful
929 : * data.
930 : */
8297 tgl 931 GIC 38786 : Assert(last_off > P_FIRSTKEY);
932 38786 : ii = PageGetItemId(opage, last_off);
6283 tgl 933 CBC 38786 : oitup = (IndexTuple) PageGetItem(opage, ii);
1091 pg 934 38786 : _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
935 38786 : !isleaf);
9770 scrappy 936 ECB :
9345 bruce 937 : /*
938 : * Move 'last' into the high key position on opage. _bt_blnewpage()
939 : * allocated empty space for a line pointer when opage was first
940 : * created, so this is a matter of rearranging already-allocated space
941 : * on page, and initializing high key line pointer. (Actually, leaf
942 : * pages must also swap oitup with a truncated version of oitup, which
943 : * is sometimes larger than oitup, though never by more than the space
944 : * needed to append a heap TID.)
945 : */
9345 bruce 946 GIC 38786 : hii = PageGetItemId(opage, P_HIKEY);
947 38786 : *hii = *ii;
5688 tgl 948 CBC 38786 : ItemIdSetUnused(ii); /* redundant */
9345 bruce 949 38786 : ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
9770 scrappy 950 ECB :
1438 pg 951 CBC 38786 : if (isleaf)
952 : {
1481 pg 953 ECB : IndexTuple lastleft;
954 : IndexTuple truncated;
955 :
956 : /*
957 : * Truncate away any unneeded attributes from high key on leaf
958 : * level. This is only done at the leaf level because downlinks
959 : * in internal pages are either negative infinity items, or get
960 : * their contents from copying from one level down. See also:
961 : * _bt_split().
962 : *
963 : * We don't try to bias our choice of split point to make it more
964 : * likely that _bt_truncate() can truncate away more attributes,
965 : * whereas the split point used within _bt_split() is chosen much
966 : * more delicately. Even still, the lastleft and firstright
967 : * tuples passed to _bt_truncate() here are at least not fully
968 : * equal to each other when deduplication is used, unless there is
969 : * a large group of duplicates (also, unique index builds usually
970 : * have few or no spool2 duplicates). When the split point is
971 : * between two unequal tuples, _bt_truncate() will avoid including
972 : * a heap TID in the new high key, which is the most important
973 : * benefit of suffix truncation.
974 : *
975 : * Overwrite the old item with new truncated high key directly.
976 : * oitup is already located at the physical beginning of tuple
977 : * space, so this should directly reuse the existing tuple space.
978 : */
1481 pg 979 GIC 38724 : ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
980 38724 : lastleft = (IndexTuple) PageGetItem(opage, ii);
1481 pg 981 ECB :
1104 pg 982 CBC 38724 : Assert(IndexTupleSize(oitup) > last_truncextra);
1481 pg 983 GIC 38724 : truncated = _bt_truncate(wstate->index, lastleft, oitup,
1481 pg 984 ECB : wstate->inskey);
1335 pg 985 CBC 38724 : if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
1335 pg 986 GIC 38724 : IndexTupleSize(truncated)))
1335 pg 987 LBC 0 : elog(ERROR, "failed to add high key to the index page");
1816 teodor 988 CBC 38724 : pfree(truncated);
1828 teodor 989 EUB :
1816 teodor 990 ECB : /* oitup should continue to point to the page's high key */
1816 teodor 991 GIC 38724 : hii = PageGetItemId(opage, P_HIKEY);
992 38724 : oitup = (IndexTuple) PageGetItem(opage, hii);
1828 teodor 993 ECB : }
994 :
995 : /*
996 : * Link the old page into its parent, using its low key. If we don't
997 : * have a parent, we have to create one; this adds a new btree level.
998 : */
7032 neilc 999 GIC 38786 : if (state->btps_next == NULL)
6885 tgl 1000 4724 : state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
7352 tgl 1001 ECB :
1249 pg 1002 CBC 38786 : Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
1003 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1249 pg 1004 ECB : BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
1005 : P_LEFTMOST(BTPageGetOpaque(opage)));
1249 pg 1006 GIC 38786 : Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
1007 : !P_LEFTMOST(BTPageGetOpaque(opage)));
1210 pg 1008 CBC 38786 : BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
1138 pg 1009 GIC 38786 : _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
1249 pg 1010 CBC 38786 : pfree(state->btps_lowkey);
9770 scrappy 1011 ECB :
9345 bruce 1012 : /*
1013 : * Save a copy of the high key from the old page. It is also the low
1014 : * key for the new page.
1015 : */
1249 pg 1016 GIC 38786 : state->btps_lowkey = CopyIndexTuple(oitup);
1017 :
8297 tgl 1018 ECB : /*
1019 : * Set the sibling links for both pages.
1020 : */
1021 : {
373 michael 1022 GIC 38786 : BTPageOpaque oopaque = BTPageGetOpaque(opage);
1023 38786 : BTPageOpaque nopaque = BTPageGetOpaque(npage);
9345 bruce 1024 ECB :
6885 tgl 1025 CBC 38786 : oopaque->btpo_next = nblkno;
6885 tgl 1026 GIC 38786 : nopaque->btpo_prev = oblkno;
2118 tgl 1027 CBC 38786 : nopaque->btpo_next = P_NONE; /* redundant */
9345 bruce 1028 ECB : }
1029 :
1030 : /*
1031 : * Write out the old page. We never need to touch it again, so we can
1032 : * free the opage workspace too.
1033 : */
6885 tgl 1034 GIC 38786 : _bt_blwritepage(wstate, opage, oblkno);
1035 :
9345 bruce 1036 ECB : /*
1037 : * Reset last_off to point to new page
1038 : */
8297 tgl 1039 GIC 38786 : last_off = P_FIRSTKEY;
1040 : }
9552 scrappy 1041 ECB :
1042 : /*
1043 : * By here, either original page is still the current page, or a new page
1044 : * was created that became the current page. Either way, the current page
1045 : * definitely has space for new item.
1046 : *
1047 : * If the new item is the first for its page, it must also be the first
1048 : * item on its entire level. On later same-level pages, a low key for a
1049 : * page will be copied from the prior page in the code above. Generate a
1050 : * minus infinity low key here instead.
1051 : */
8297 tgl 1052 GIC 11150363 : if (last_off == P_HIKEY)
1053 : {
1249 pg 1054 CBC 23225 : Assert(state->btps_lowkey == NULL);
1249 pg 1055 GIC 23225 : state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1249 pg 1056 CBC 23225 : state->btps_lowkey->t_info = sizeof(IndexTupleData);
1097 1057 23225 : BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
9345 bruce 1058 ECB : }
8297 tgl 1059 :
1060 : /*
1061 : * Add the new item into the current page.
1062 : */
8297 tgl 1063 GIC 11150363 : last_off = OffsetNumberNext(last_off);
1091 pg 1064 11150363 : _bt_sortaddtup(npage, itupsz, itup, last_off,
1091 pg 1065 CBC 11150363 : !isleaf && last_off == P_FIRSTKEY);
9345 bruce 1066 ECB :
9345 bruce 1067 CBC 11150363 : state->btps_page = npage;
6885 tgl 1068 GIC 11150363 : state->btps_blkno = nblkno;
9345 bruce 1069 CBC 11150363 : state->btps_lastoff = last_off;
9552 scrappy 1070 11150363 : }
9552 scrappy 1071 ECB :
1138 pg 1072 : /*
1073 : * Finalize pending posting list tuple, and add it to the index. Final tuple
1074 : * is based on saved base tuple, and saved list of heap TIDs.
1075 : *
1076 : * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1077 : * using _bt_buildadd().
1078 : */
1079 : static void
1138 pg 1080 GIC 2402111 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1081 : BTDedupState dstate)
1138 pg 1082 ECB : {
1138 pg 1083 GIC 2402111 : Assert(dstate->nitems > 0);
1084 :
1138 pg 1085 CBC 2402111 : if (dstate->nitems == 1)
1138 pg 1086 GIC 2372355 : _bt_buildadd(wstate, state, dstate->base, 0);
1138 pg 1087 ECB : else
1088 : {
1089 : IndexTuple postingtuple;
1090 : Size truncextra;
1091 :
1092 : /* form a tuple with a posting list */
1138 pg 1093 GIC 29756 : postingtuple = _bt_form_posting(dstate->base,
1094 : dstate->htids,
1138 pg 1095 ECB : dstate->nhtids);
1096 : /* Calculate posting list overhead */
1138 pg 1097 GIC 59512 : truncextra = IndexTupleSize(postingtuple) -
1098 29756 : BTreeTupleGetPostingOffset(postingtuple);
1138 pg 1099 ECB :
1138 pg 1100 CBC 29756 : _bt_buildadd(wstate, state, postingtuple, truncextra);
1138 pg 1101 GIC 29756 : pfree(postingtuple);
1138 pg 1102 ECB : }
1103 :
1024 pg 1104 GIC 2402111 : dstate->nmaxitems = 0;
1138 1105 2402111 : dstate->nhtids = 0;
1138 pg 1106 CBC 2402111 : dstate->nitems = 0;
1107 2402111 : dstate->phystupsize = 0;
1108 2402111 : }
1138 pg 1109 ECB :
8297 tgl 1110 : /*
1111 : * Finish writing out the completed btree.
1112 : */
1113 : static void
6885 tgl 1114 GIC 64113 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1115 : {
9344 bruce 1116 ECB : BTPageState *s;
6797 bruce 1117 GIC 64113 : BlockNumber rootblkno = P_NONE;
7132 tgl 1118 64113 : uint32 rootlevel = 0;
6885 tgl 1119 ECB : Page metapage;
9552 scrappy 1120 :
1121 : /*
1122 : * Each iteration of this loop completes one more level of the tree.
1123 : */
7032 neilc 1124 GIC 87338 : for (s = state; s != NULL; s = s->btps_next)
1125 : {
8297 tgl 1126 ECB : BlockNumber blkno;
1127 : BTPageOpaque opaque;
1128 :
6885 tgl 1129 GIC 23225 : blkno = s->btps_blkno;
373 michael 1130 23225 : opaque = BTPageGetOpaque(s->btps_page);
9552 scrappy 1131 ECB :
9345 bruce 1132 : /*
1133 : * We have to link the last page on this level to somewhere.
1134 : *
1135 : * If we're at the top, it's the root, so attach it to the metapage.
1136 : * Otherwise, add an entry for it to its parent using its low key.
1137 : * This may cause the last page of the parent level to split, but
1138 : * that's not a problem -- we haven't gotten to it yet.
1139 : */
7032 neilc 1140 GIC 23225 : if (s->btps_next == NULL)
1141 : {
8297 tgl 1142 CBC 18501 : opaque->btpo_flags |= BTP_ROOT;
7132 tgl 1143 GIC 18501 : rootblkno = blkno;
7132 tgl 1144 CBC 18501 : rootlevel = s->btps_level;
8297 tgl 1145 ECB : }
1146 : else
1147 : {
1249 pg 1148 GIC 4724 : Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1149 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1249 pg 1150 ECB : BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1151 : P_LEFTMOST(opaque));
1249 pg 1152 GIC 4724 : Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1153 : !P_LEFTMOST(opaque));
1210 pg 1154 CBC 4724 : BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1138 pg 1155 GIC 4724 : _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1249 pg 1156 CBC 4724 : pfree(s->btps_lowkey);
1157 4724 : s->btps_lowkey = NULL;
9345 bruce 1158 ECB : }
9552 scrappy 1159 :
1160 : /*
1161 : * This is the rightmost page, so the ItemId array needs to be slid
1162 : * back one slot. Then we can dump out the page.
1163 : */
6885 tgl 1164 GIC 23225 : _bt_slideleft(s->btps_page);
1165 23225 : _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
6885 tgl 1166 CBC 23225 : s->btps_page = NULL; /* writepage freed the workspace */
9345 bruce 1167 ECB : }
7132 tgl 1168 :
1169 : /*
1170 : * As the last step in the process, construct the metapage and make it
1171 : * point to the new root (unless we had no data at all, in which case it's
1172 : * set to point to "P_NONE"). This changes the index to the "valid" state
1173 : * by filling in a valid magic number in the metapage.
1174 : */
1 tmunro 1175 GNC 64113 : metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
1138 pg 1176 GIC 64113 : _bt_initmetapage(metapage, rootblkno, rootlevel,
1138 pg 1177 CBC 64113 : wstate->inskey->allequalimage);
6885 tgl 1178 64113 : _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
9770 scrappy 1179 64113 : }
9770 scrappy 1180 ECB :
1181 : /*
1182 : * Read tuples in correct sort order from tuplesort, and load them into
1183 : * btree leaves.
1184 : */
1185 : static void
6885 tgl 1186 GIC 64113 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1187 : {
8297 tgl 1188 CBC 64113 : BTPageState *state = NULL;
8277 inoue 1189 GIC 64113 : bool merge = (btspool2 != NULL);
6283 tgl 1190 ECB : IndexTuple itup,
6283 tgl 1191 CBC 64113 : itup2 = NULL;
1192 : bool load1;
6885 1193 64113 : TupleDesc tupdes = RelationGetDescr(wstate->index);
1194 : int i,
1828 teodor 1195 64113 : keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1196 : SortSupport sortKeys;
1441 alvherre 1197 64113 : int64 tuples_done = 0;
1198 : bool deduplicate;
1138 pg 1199 ECB :
996 pg 1200 GIC 71550 : deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1138 1201 7437 : BTGetDeduplicateItems(wstate->index);
8277 inoue 1202 ECB :
8277 inoue 1203 CBC 64113 : if (merge)
1204 : {
8277 inoue 1205 ECB : /*
1206 : * Another BTSpool for dead tuples exists. Now we have to merge
1207 : * btspool and btspool2.
1208 : */
1209 :
1210 : /* the preparation of merge */
2309 rhaas 1211 GIC 13 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1212 13 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
7088 tgl 1213 ECB :
3075 rhaas 1214 : /* Prepare SortSupport data for each column */
3075 rhaas 1215 GIC 13 : sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1216 :
3075 rhaas 1217 CBC 27 : for (i = 0; i < keysz; i++)
1218 : {
1219 14 : SortSupport sortKey = sortKeys + i;
1481 pg 1220 GIC 14 : ScanKey scanKey = wstate->inskey->scankeys + i;
3075 rhaas 1221 ECB : int16 strategy;
1222 :
3075 rhaas 1223 GIC 14 : sortKey->ssup_cxt = CurrentMemoryContext;
1224 14 : sortKey->ssup_collation = scanKey->sk_collation;
3075 rhaas 1225 CBC 14 : sortKey->ssup_nulls_first =
1226 14 : (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1227 14 : sortKey->ssup_attno = scanKey->sk_attno;
3002 rhaas 1228 ECB : /* Abbreviation is not supported here */
3002 rhaas 1229 CBC 14 : sortKey->abbreviate = false;
1230 :
163 peter 1231 GNC 14 : Assert(sortKey->ssup_attno != 0);
1232 :
3075 rhaas 1233 CBC 14 : strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1234 : BTGreaterStrategyNumber : BTLessStrategyNumber;
3075 rhaas 1235 ECB :
3075 rhaas 1236 GIC 14 : PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1237 : }
3075 rhaas 1238 ECB :
1239 : for (;;)
1240 : {
8053 bruce 1241 GIC 1626 : load1 = true; /* load BTSpool next ? */
6283 tgl 1242 1626 : if (itup2 == NULL)
8277 inoue 1243 ECB : {
6283 tgl 1244 CBC 75 : if (itup == NULL)
8277 inoue 1245 GIC 13 : break;
8277 inoue 1246 ECB : }
6283 tgl 1247 CBC 1551 : else if (itup != NULL)
1248 : {
1481 pg 1249 1460 : int32 compare = 0;
1250 :
8277 inoue 1251 1583 : for (i = 1; i <= keysz; i++)
1252 : {
2878 bruce 1253 ECB : SortSupport entry;
1254 : Datum attrDatum1,
1255 : attrDatum2;
1256 : bool isNull1,
1257 : isNull2;
1258 :
3075 rhaas 1259 GIC 1488 : entry = sortKeys + i - 1;
5934 tgl 1260 1488 : attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
5934 tgl 1261 CBC 1488 : attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
3075 rhaas 1262 ECB :
3075 rhaas 1263 CBC 1488 : compare = ApplySortComparator(attrDatum1, isNull1,
1264 : attrDatum2, isNull2,
3075 rhaas 1265 ECB : entry);
5934 tgl 1266 GIC 1488 : if (compare > 0)
1267 : {
5934 tgl 1268 CBC 114 : load1 = false;
5934 tgl 1269 GIC 1365 : break;
5934 tgl 1270 ECB : }
5934 tgl 1271 CBC 1374 : else if (compare < 0)
5934 tgl 1272 GIC 1251 : break;
8277 inoue 1273 ECB : }
1481 pg 1274 :
1275 : /*
1276 : * If key values are equal, we sort on ItemPointer. This is
1277 : * required for btree indexes, since heap TID is treated as an
1278 : * implicit last key attribute in order to ensure that all
1279 : * keys in the index are physically unique.
1280 : */
1481 pg 1281 GIC 1460 : if (compare == 0)
1282 : {
1481 pg 1283 CBC 95 : compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1481 pg 1284 GIC 95 : Assert(compare != 0);
1481 pg 1285 CBC 95 : if (compare > 0)
1286 20 : load1 = false;
1481 pg 1287 ECB : }
8277 inoue 1288 : }
1289 : else
8277 inoue 1290 GIC 91 : load1 = false;
1291 :
8277 inoue 1292 ECB : /* When we see first tuple, create first index page */
8277 inoue 1293 GIC 1613 : if (state == NULL)
6885 tgl 1294 13 : state = _bt_pagestate(wstate, 0);
8277 inoue 1295 ECB :
8277 inoue 1296 CBC 1613 : if (load1)
1297 : {
1138 pg 1298 1388 : _bt_buildadd(wstate, state, itup, 0);
2309 rhaas 1299 GIC 1388 : itup = tuplesort_getindextuple(btspool->sortstate, true);
8277 inoue 1300 ECB : }
1301 : else
1302 : {
1138 pg 1303 GIC 225 : _bt_buildadd(wstate, state, itup2, 0);
2309 rhaas 1304 225 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
8277 inoue 1305 ECB : }
1468 alvherre 1306 :
1307 : /* Report progress */
1468 alvherre 1308 GIC 1613 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1309 : ++tuples_done);
8277 inoue 1310 ECB : }
3075 rhaas 1311 GIC 13 : pfree(sortKeys);
1312 : }
1138 pg 1313 CBC 64100 : else if (deduplicate)
1314 : {
1138 pg 1315 ECB : /* merge is unnecessary, deduplicate into posting lists */
1316 : BTDedupState dstate;
1317 :
1138 pg 1318 GIC 7437 : dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1319 7437 : dstate->deduplicate = true; /* unused */
1024 pg 1320 CBC 7437 : dstate->nmaxitems = 0; /* unused */
1138 1321 7437 : dstate->maxpostingsize = 0; /* set later */
1138 pg 1322 ECB : /* Metadata about base tuple of current pending posting list */
1138 pg 1323 CBC 7437 : dstate->base = NULL;
1138 pg 1324 GIC 7437 : dstate->baseoff = InvalidOffsetNumber; /* unused */
1138 pg 1325 CBC 7437 : dstate->basetupsize = 0;
1138 pg 1326 ECB : /* Metadata about current pending posting list TIDs */
1138 pg 1327 CBC 7437 : dstate->htids = NULL;
1138 pg 1328 GIC 7437 : dstate->nhtids = 0;
1138 pg 1329 CBC 7437 : dstate->nitems = 0;
1330 7437 : dstate->phystupsize = 0; /* unused */
1331 7437 : dstate->nintervals = 0; /* unused */
1138 pg 1332 ECB :
1138 pg 1333 CBC 3072212 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1138 pg 1334 GIC 3072212 : true)) != NULL)
1138 pg 1335 ECB : {
1336 : /* When we see first tuple, create first index page */
1138 pg 1337 GIC 3064775 : if (state == NULL)
1338 : {
1138 pg 1339 CBC 1563 : state = _bt_pagestate(wstate, 0);
1340 :
1138 pg 1341 ECB : /*
1342 : * Limit size of posting list tuples to 1/10 space we want to
1343 : * leave behind on the page, plus space for final item's line
1344 : * pointer. This is equal to the space that we'd like to
1345 : * leave behind on each leaf page when fillfactor is 90,
1346 : * allowing us to get close to fillfactor% space utilization
1347 : * when there happen to be a great many duplicates. (This
1348 : * makes higher leaf fillfactor settings ineffective when
1349 : * building indexes that have many duplicates, but packing
1350 : * leaf pages full with few very large tuples doesn't seem
1351 : * like a useful goal.)
1352 : */
1138 pg 1353 GIC 1563 : dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1354 : sizeof(ItemIdData);
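/*
 * For example, with the default 8 kB BLCKSZ, 8-byte MAXALIGN and
 * 4-byte line pointers: BLCKSZ * 10 / 100 = 819, MAXALIGN_DOWN(819)
 * = 816, minus sizeof(ItemIdData) = 4, caps each posting list tuple
 * at 812 bytes.
 */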
1138 pg 1355 CBC 1563 : Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
1356 : dstate->maxpostingsize <= INDEX_SIZE_MASK);
1357 1563 : dstate->htids = palloc(dstate->maxpostingsize);
1358 :
1138 pg 1359 ECB : /* start new pending posting list with itup copy */
1138 pg 1360 GIC 1563 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1361 : InvalidOffsetNumber);
1138 pg 1362 ECB : }
1138 pg 1363 GIC 3063212 : else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1364 666407 : itup) > keysz &&
1138 pg 1365 CBC 666407 : _bt_dedup_save_htid(dstate, itup))
1138 pg 1366 ECB : {
1367 : /*
1368 : * Tuple is equal to base tuple of pending posting list. Heap
1369 : * TID from itup has been saved in state.
1370 : */
1371 : }
1372 : else
1373 : {
1374 : /*
1375 : * Tuple is not equal to pending posting list tuple, or
1376 : * _bt_dedup_save_htid() opted to not merge current item into
1377 : * pending posting list.
1378 : */
1138 pg 1379 GIC 2400548 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1380 2400548 : pfree(dstate->base);
1138 pg 1381 ECB :
1382 : /* start new pending posting list with itup copy */
1138 pg 1383 GIC 2400548 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1384 : InvalidOffsetNumber);
1138 pg 1385 ECB : }
1386 :
1387 : /* Report progress */
1138 pg 1388 GIC 3064775 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1389 : ++tuples_done);
1138 pg 1390 ECB : }
1391 :
1138 pg 1392 GIC 7437 : if (state)
1393 : {
1138 pg 1394 ECB : /*
1395 : * Handle the last item (there must be a last item when the
1396 : * tuplesort returned one or more tuples)
1397 : */
1138 pg 1398 GIC 1563 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1399 1563 : pfree(dstate->base);
1138 pg 1400 CBC 1563 : pfree(dstate->htids);
1138 pg 1401 ECB : }
1402 :
1138 pg 1403 GIC 7437 : pfree(dstate);
1404 : }
8053 bruce 1405 ECB : else
1406 : {
1407 : /* merging and deduplication are both unnecessary */
6283 tgl 1408 GIC 8759792 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
2309 rhaas 1409 8759792 : true)) != NULL)
8277 inoue 1410 ECB : {
1411 : /* When we see first tuple, create first index page */
8277 inoue 1412 GIC 8703129 : if (state == NULL)
6885 tgl 1413 16925 : state = _bt_pagestate(wstate, 0);
8297 tgl 1414 ECB :
1138 pg 1415 CBC 8703129 : _bt_buildadd(wstate, state, itup, 0);
1416 :
1468 alvherre 1417 ECB : /* Report progress */
1468 alvherre 1418 GIC 8703129 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1419 : ++tuples_done);
8277 inoue 1420 ECB : }
1421 : }
1422 :
1423 : /* Close down final pages and write the metapage */
6885 tgl 1424 GIC 64113 : _bt_uppershutdown(wstate, state);
1425 :
6885 tgl 1426 ECB : /*
 1427 : * When we WAL-log index pages, we must nonetheless fsync index files.
1428 : * Since we're building outside shared buffers, a CHECKPOINT occurring
1429 : * during the build has no way to flush the previously written data to
1430 : * disk (indeed it won't know the index even exists). A crash later on
1431 : * would replay WAL from the checkpoint, therefore it wouldn't replay our
1432 : * earlier WAL entries. If we do not fsync those pages here, they might
1433 : * still not be on disk when the crash occurs.
1434 : */
1100 noah 1435 GIC 64113 : if (wstate->btws_use_wal)
636 tgl 1436 59235 : smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM);
9770 scrappy 1437 CBC 64113 : }
1892 rhaas 1438 ECB :
1439 : /*
1440 : * Create parallel context, and launch workers for leader.
1441 : *
1442 : * buildstate argument should be initialized (with the exception of the
1443 : * tuplesort state in spools, which may later be created based on shared
1444 : * state initially set up here).
1445 : *
1446 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1447 : *
1448 : * request is the target number of parallel worker processes to launch.
1449 : *
1450 : * Sets buildstate's BTLeader, which caller must use to shut down parallel
1451 : * mode by passing it to _bt_end_parallel() at the very end of its index
1452 : * build. If not even a single worker process can be launched, this is
1453 : * never set, and caller should proceed with a serial index build.
1454 : */
1455 : static void
1892 rhaas 1456 GIC 71 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1457 : {
1892 rhaas 1458 ECB : ParallelContext *pcxt;
1459 : int scantuplesortstates;
1460 : Snapshot snapshot;
1461 : Size estbtshared;
1462 : Size estsort;
1463 : BTShared *btshared;
1464 : Sharedsort *sharedsort;
1465 : Sharedsort *sharedsort2;
1892 rhaas 1466 GIC 71 : BTSpool *btspool = buildstate->spool;
1467 71 : BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1100 akapila 1468 ECB : WalUsage *walusage;
1095 1469 : BufferUsage *bufferusage;
1892 rhaas 1470 GIC 71 : bool leaderparticipates = true;
1471 : int querylen;
1892 rhaas 1472 ECB :
1473 : #ifdef DISABLE_LEADER_PARTICIPATION
1474 : leaderparticipates = false;
1475 : #endif
1476 :
1477 : /*
1478 : * Enter parallel mode, and create context for parallel build of btree
1479 : * index
1480 : */
1892 rhaas 1481 GIC 71 : EnterParallelMode();
1482 71 : Assert(request > 0);
1892 rhaas 1483 CBC 71 : pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1486 tmunro 1484 ECB : request);
1164 1485 :
1892 rhaas 1486 GIC 71 : scantuplesortstates = leaderparticipates ? request + 1 : request;
1487 :
1892 rhaas 1488 ECB : /*
1489 : * Prepare for scan of the base relation. In a normal index build, we use
1490 : * SnapshotAny because we must retrieve all tuples and do our own time
1491 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1492 : * concurrent build, we take a regular MVCC snapshot and index whatever's
1493 : * live according to that.
1494 : */
1892 rhaas 1495 GIC 71 : if (!isconcurrent)
1496 71 : snapshot = SnapshotAny;
1892 rhaas 1497 ECB : else
1892 rhaas 1498 LBC 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1499 :
1892 rhaas 1500 EUB : /*
1501 : * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1502 : * PARALLEL_KEY_TUPLESORT tuplesort workspace
1503 : */
1490 andres 1504 GIC 71 : estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1892 rhaas 1505 71 : shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1892 rhaas 1506 CBC 71 : estsort = tuplesort_estimate_shared(scantuplesortstates);
1507 71 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1892 rhaas 1508 ECB :
1509 : /*
1510 : * Unique case requires a second spool, and so we may have to account for
1511 : * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1512 : */
1892 rhaas 1513 GIC 71 : if (!btspool->isunique)
1514 39 : shm_toc_estimate_keys(&pcxt->estimator, 2);
1892 rhaas 1515 ECB : else
1516 : {
1892 rhaas 1517 GIC 32 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1518 32 : shm_toc_estimate_keys(&pcxt->estimator, 3);
1892 rhaas 1519 ECB : }
1520 :
1521 : /*
1522 : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1523 : * and PARALLEL_KEY_BUFFER_USAGE.
1524 : *
1525 : * If there are no extensions loaded that care, we could skip this. We
1526 : * have no way of knowing whether anyone's looking at pgWalUsage or
1527 : * pgBufferUsage, so do it unconditionally.
1528 : */
1100 akapila 1529 GIC 71 : shm_toc_estimate_chunk(&pcxt->estimator,
1530 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1100 akapila 1531 CBC 71 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1095 akapila 1532 GIC 71 : shm_toc_estimate_chunk(&pcxt->estimator,
1095 akapila 1533 ECB : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1095 akapila 1534 CBC 71 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1535 :
1844 rhaas 1536 ECB : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
890 noah 1537 GIC 71 : if (debug_query_string)
1538 : {
890 noah 1539 CBC 71 : querylen = strlen(debug_query_string);
890 noah 1540 GIC 71 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
890 noah 1541 CBC 71 : shm_toc_estimate_keys(&pcxt->estimator, 1);
890 noah 1542 ECB : }
1543 : else
890 noah 1544 UIC 0 : querylen = 0; /* keep compiler quiet */
1545 :
1892 rhaas 1546 EUB : /* Everyone's had a chance to ask for space, so now create the DSM */
1892 rhaas 1547 GIC 71 : InitializeParallelDSM(pcxt);
1548 :
1159 tmunro 1549 ECB : /* If no DSM segment was available, back out (do serial build) */
1159 tmunro 1550 GIC 71 : if (pcxt->seg == NULL)
1551 : {
1159 tmunro 1552 LBC 0 : if (IsMVCCSnapshot(snapshot))
1159 tmunro 1553 UIC 0 : UnregisterSnapshot(snapshot);
1159 tmunro 1554 UBC 0 : DestroyParallelContext(pcxt);
1555 0 : ExitParallelMode();
1556 0 : return;
1159 tmunro 1557 EUB : }
1558 :
1559 : /* Store shared build state, for which we reserved space */
1892 rhaas 1560 GIC 71 : btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1561 : /* Initialize immutable state */
1892 rhaas 1562 CBC 71 : btshared->heaprelid = RelationGetRelid(btspool->heap);
1892 rhaas 1563 GIC 71 : btshared->indexrelid = RelationGetRelid(btspool->index);
1892 rhaas 1564 CBC 71 : btshared->isunique = btspool->isunique;
430 peter 1565 71 : btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1892 rhaas 1566 71 : btshared->isconcurrent = isconcurrent;
1567 71 : btshared->scantuplesortstates = scantuplesortstates;
1568 71 : ConditionVariableInit(&btshared->workersdonecv);
1569 71 : SpinLockInit(&btshared->mutex);
1892 rhaas 1570 ECB : /* Initialize mutable state */
1892 rhaas 1571 CBC 71 : btshared->nparticipantsdone = 0;
1892 rhaas 1572 GIC 71 : btshared->reltuples = 0.0;
1892 rhaas 1573 CBC 71 : btshared->havedead = false;
1574 71 : btshared->indtuples = 0.0;
1575 71 : btshared->brokenhotchain = false;
1490 andres 1576 71 : table_parallelscan_initialize(btspool->heap,
1490 andres 1577 ECB : ParallelTableScanFromBTShared(btshared),
1578 : snapshot);
1579 :
1580 : /*
1581 : * Store shared tuplesort-private state, for which we reserved space.
1582 : * Then, initialize opaque state using tuplesort routine.
1583 : */
1892 rhaas 1584 GIC 71 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1585 71 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1892 rhaas 1586 ECB : pcxt->seg);
1587 :
1892 rhaas 1588 GIC 71 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1589 71 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1892 rhaas 1590 ECB :
1591 : /* Unique case requires a second spool, and associated shared state */
1892 rhaas 1592 GIC 71 : if (!btspool->isunique)
1593 39 : sharedsort2 = NULL;
1892 rhaas 1594 ECB : else
1595 : {
1596 : /*
1597 : * Store additional shared tuplesort-private state, for which we
1598 : * reserved space. Then, initialize opaque state using tuplesort
1599 : * routine.
1600 : */
1892 rhaas 1601 GIC 32 : sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1602 32 : tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1892 rhaas 1603 ECB : pcxt->seg);
1604 :
1892 rhaas 1605 GIC 32 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1606 : }
1892 rhaas 1607 ECB :
1608 : /* Store query string for workers */
890 noah 1609 GIC 71 : if (debug_query_string)
1610 : {
890 noah 1611 ECB : char *sharedquery;
1612 :
890 noah 1613 GIC 71 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1614 71 : memcpy(sharedquery, debug_query_string, querylen + 1);
890 noah 1615 CBC 71 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
890 noah 1616 ECB : }
1844 rhaas 1617 :
1618 : /*
1619 : * Allocate space for each worker's WalUsage and BufferUsage; no need to
1620 : * initialize.
1621 : */
1100 akapila 1622 GIC 71 : walusage = shm_toc_allocate(pcxt->toc,
1623 71 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1100 akapila 1624 CBC 71 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1095 1625 71 : bufferusage = shm_toc_allocate(pcxt->toc,
1626 71 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1627 71 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1100 akapila 1628 ECB :
1892 rhaas 1629 : /* Launch workers, saving status for leader/caller */
1892 rhaas 1630 GIC 71 : LaunchParallelWorkers(pcxt);
1631 71 : btleader->pcxt = pcxt;
1892 rhaas 1632 CBC 71 : btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1633 71 : if (leaderparticipates)
1634 71 : btleader->nparticipanttuplesorts++;
1635 71 : btleader->btshared = btshared;
1636 71 : btleader->sharedsort = sharedsort;
1637 71 : btleader->sharedsort2 = sharedsort2;
1638 71 : btleader->snapshot = snapshot;
1100 akapila 1639 71 : btleader->walusage = walusage;
1095 1640 71 : btleader->bufferusage = bufferusage;
1892 rhaas 1641 ECB :
1642 : /* If no workers were successfully launched, back out (do serial build) */
1892 rhaas 1643 GIC 71 : if (pcxt->nworkers_launched == 0)
1644 : {
1892 rhaas 1645 LBC 0 : _bt_end_parallel(btleader);
1892 rhaas 1646 UIC 0 : return;
1892 rhaas 1647 EUB : }
1648 :
1649 : /* Save leader state now that it's clear build will be parallel */
1892 rhaas 1650 GIC 71 : buildstate->btleader = btleader;
1651 :
1892 rhaas 1652 ECB : /* Join heap scan ourselves */
1892 rhaas 1653 GIC 71 : if (leaderparticipates)
1654 71 : _bt_leader_participate_as_worker(buildstate);
1892 rhaas 1655 ECB :
1656 : /*
1657 : * Caller needs to wait for all launched workers when we return. Make
1658 : * sure that the failure-to-start case will not hang forever.
1659 : */
1892 rhaas 1660 GIC 71 : WaitForParallelWorkersToAttach(pcxt);
1661 : }
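
To make the contract in the header comment concrete, here is a minimal caller-side sketch (example_build is a hypothetical name; in nbtree the actual caller is _bt_spools_heapscan, earlier in this file): when no DSM segment or no workers could be obtained, buildstate->btleader stays unset and the caller falls back to a serial build; otherwise it must eventually pass the leader state to _bt_end_parallel().

    /*
     * Caller-side sketch (illustrative only; example_build is not part of
     * nbtsort.c).
     */
    static void
    example_build(BTBuildState *buildstate, bool isconcurrent, int request)
    {
        _bt_begin_parallel(buildstate, isconcurrent, request);

        if (buildstate->btleader == NULL)
        {
            /* no DSM segment or no workers launched: do a serial build */
        }
        else
        {
            /* ... parallel heap scan, sort, and _bt_load() happen here ... */

            /* must be called at the very end of the index build */
            _bt_end_parallel(buildstate->btleader);
        }
    }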
1892 rhaas 1662 ECB :
1663 : /*
1664 : * Shut down workers, destroy parallel context, and end parallel mode.
1665 : */
1666 : static void
1892 rhaas 1667 GIC 71 : _bt_end_parallel(BTLeader *btleader)
1668 : {
1100 akapila 1669 ECB : int i;
1670 :
1671 : /* Shutdown worker processes */
1892 rhaas 1672 GIC 71 : WaitForParallelWorkersToFinish(btleader->pcxt);
1673 :
1100 akapila 1674 ECB : /*
1675 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1676 : * or we might get incomplete data.)
1677 : */
1100 akapila 1678 GIC 142 : for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1095 1679 71 : InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1100 akapila 1680 ECB :
1892 rhaas 1681 : /* Free last reference to MVCC snapshot, if one was used */
1892 rhaas 1682 GIC 71 : if (IsMVCCSnapshot(btleader->snapshot))
1892 rhaas 1683 UIC 0 : UnregisterSnapshot(btleader->snapshot);
1892 rhaas 1684 CBC 71 : DestroyParallelContext(btleader->pcxt);
1892 rhaas 1685 GBC 71 : ExitParallelMode();
1892 rhaas 1686 CBC 71 : }
1892 rhaas 1687 ECB :
1688 : /*
1689 : * Returns size of shared memory required to store state for a parallel
1690 : * btree index build based on the snapshot its parallel scan will use.
1691 : */
1692 : static Size
1490 andres 1693 GIC 71 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1694 : {
1490 andres 1695 ECB : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1490 andres 1696 GIC 71 : return add_size(BUFFERALIGN(sizeof(BTShared)),
1697 : table_parallelscan_estimate(heap, snapshot));
1892 rhaas 1698 ECB : }
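
The size computed above feeds the shm_toc pattern that _bt_begin_parallel() and _bt_parallel_build_main() use for every piece of shared state: register chunk sizes and key counts on the estimator, allocate and publish each chunk under a key once the DSM segment exists, and look it up by the same key from the workers. A minimal sketch of that pattern, using a hypothetical key and payload rather than anything from nbtsort.c:

    #define EXAMPLE_SHARED_KEY  UINT64CONST(0xB000000000000001)    /* hypothetical */

    static void
    example_leader_setup(ParallelContext *pcxt, Size nbytes)
    {
        char       *space;

        /* while sizing the segment: one chunk plus one TOC key */
        shm_toc_estimate_chunk(&pcxt->estimator, nbytes);
        shm_toc_estimate_keys(&pcxt->estimator, 1);

        /* ... InitializeParallelDSM(pcxt) runs between estimate and allocate ... */

        /* carve the chunk out of the segment and publish it under the key */
        space = shm_toc_allocate(pcxt->toc, nbytes);
        shm_toc_insert(pcxt->toc, EXAMPLE_SHARED_KEY, space);
    }

    static void
    example_worker_lookup(shm_toc *toc)
    {
        /* workers locate the chunk by key; false => ERROR if it is missing */
        char       *space = shm_toc_lookup(toc, EXAMPLE_SHARED_KEY, false);

        (void) space;           /* use the shared state here */
    }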
1699 :
1700 : /*
1701 : * Within leader, wait for end of heap scan.
1702 : *
1703 : * When called, parallel heap scan started by _bt_begin_parallel() will
1704 : * already be underway within worker processes (when leader participates
1705 : * as a worker, we should end up here just as workers are finishing).
1706 : *
1707 : * Fills in fields needed for ambuild statistics, and lets caller set
1708 : * field indicating that some worker encountered a broken HOT chain.
1709 : *
1710 : * Returns the total number of heap tuples scanned.
1711 : */
1712 : static double
1892 rhaas 1713 GIC 71 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1714 : {
1892 rhaas 1715 CBC 71 : BTShared *btshared = buildstate->btleader->btshared;
1716 : int nparticipanttuplesorts;
1892 rhaas 1717 ECB : double reltuples;
1718 :
1892 rhaas 1719 GIC 71 : nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1720 : for (;;)
1892 rhaas 1721 ECB : {
1892 rhaas 1722 GIC 185 : SpinLockAcquire(&btshared->mutex);
1723 185 : if (btshared->nparticipantsdone == nparticipanttuplesorts)
1892 rhaas 1724 ECB : {
1892 rhaas 1725 CBC 71 : buildstate->havedead = btshared->havedead;
1892 rhaas 1726 GIC 71 : buildstate->indtuples = btshared->indtuples;
1892 rhaas 1727 CBC 71 : *brokenhotchain = btshared->brokenhotchain;
1728 71 : reltuples = btshared->reltuples;
1729 71 : SpinLockRelease(&btshared->mutex);
1730 71 : break;
1892 rhaas 1731 ECB : }
1892 rhaas 1732 CBC 114 : SpinLockRelease(&btshared->mutex);
1733 :
1734 114 : ConditionVariableSleep(&btshared->workersdonecv,
1735 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1892 rhaas 1736 ECB : }
1737 :
1892 rhaas 1738 GIC 71 : ConditionVariableCancelSleep();
1739 :
1892 rhaas 1740 CBC 71 : return reltuples;
1741 : }
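
    /*
     * Note on the wait loop above: it follows the usual condition-variable
     * idiom -- re-check the spinlock-protected counter on every wakeup,
     * sleep on workersdonecv otherwise, and call
     * ConditionVariableCancelSleep() once the loop exits.  The matching
     * wake-up is the ConditionVariableSignal() call at the end of
     * _bt_parallel_scan_and_sort() below.
     */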
1892 rhaas 1742 ECB :
1743 : /*
1744 : * Within leader, participate as a parallel worker.
1745 : */
1746 : static void
1892 rhaas 1747 GIC 71 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
1748 : {
1892 rhaas 1749 CBC 71 : BTLeader *btleader = buildstate->btleader;
1750 : BTSpool *leaderworker;
1892 rhaas 1751 ECB : BTSpool *leaderworker2;
1752 : int sortmem;
1753 :
1754 : /* Allocate memory and initialize private spool */
1892 rhaas 1755 GIC 71 : leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1756 71 : leaderworker->heap = buildstate->spool->heap;
1892 rhaas 1757 CBC 71 : leaderworker->index = buildstate->spool->index;
1758 71 : leaderworker->isunique = buildstate->spool->isunique;
430 peter 1759 71 : leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1892 rhaas 1760 ECB :
1761 : /* Initialize second spool, if required */
1892 rhaas 1762 GIC 71 : if (!btleader->btshared->isunique)
1763 39 : leaderworker2 = NULL;
1892 rhaas 1764 ECB : else
1765 : {
1766 : /* Allocate memory for worker's own private secondary spool */
1892 rhaas 1767 GIC 32 : leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1768 :
1892 rhaas 1769 ECB : /* Initialize worker's own secondary spool */
1892 rhaas 1770 GIC 32 : leaderworker2->heap = leaderworker->heap;
1771 32 : leaderworker2->index = leaderworker->index;
1892 rhaas 1772 CBC 32 : leaderworker2->isunique = false;
1892 rhaas 1773 ECB : }
1774 :
1775 : /*
 1776 : * Might as well use a reliable figure when doling out maintenance_work_mem
 1777 : * (when the requested number of workers was not launched, this will be
1778 : * somewhat higher than it is for other workers).
1779 : */
1892 rhaas 1780 GIC 71 : sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
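         /*
          * Illustrative numbers (not from the code): with maintenance_work_mem
          * at its 64MB default (65536 kB) and two launched workers plus the
          * leader, nparticipanttuplesorts is 3, so each participant sorts
          * with about 21845 kB.
          */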
1781 :
1892 rhaas 1782 ECB : /* Perform work common to all participants */
1892 rhaas 1783 GIC 71 : _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1784 : btleader->sharedsort, btleader->sharedsort2,
1468 alvherre 1785 ECB : sortmem, true);
1786 :
1787 : #ifdef BTREE_BUILD_STATS
1788 : if (log_btree_build_stats)
1789 : {
1790 : ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1791 : ResetUsage();
1792 : }
1793 : #endif /* BTREE_BUILD_STATS */
1892 rhaas 1794 GIC 71 : }
1795 :
1892 rhaas 1796 ECB : /*
1797 : * Perform work within a launched parallel process.
1798 : */
1799 : void
1892 rhaas 1800 GIC 71 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1801 : {
1844 rhaas 1802 ECB : char *sharedquery;
1803 : BTSpool *btspool;
1804 : BTSpool *btspool2;
1805 : BTShared *btshared;
1806 : Sharedsort *sharedsort;
1807 : Sharedsort *sharedsort2;
1808 : Relation heapRel;
1809 : Relation indexRel;
1810 : LOCKMODE heapLockmode;
1811 : LOCKMODE indexLockmode;
1812 : WalUsage *walusage;
1813 : BufferUsage *bufferusage;
1814 : int sortmem;
1815 :
1816 : #ifdef BTREE_BUILD_STATS
1817 : if (log_btree_build_stats)
1818 : ResetUsage();
1819 : #endif /* BTREE_BUILD_STATS */
1820 :
1821 : /*
 1822 : * The only possible status flag that can be set for the parallel worker is
1823 : * PROC_IN_SAFE_IC.
1824 : */
506 akapila 1825 GIC 71 : Assert((MyProc->statusFlags == 0) ||
1826 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
506 akapila 1827 ECB :
1828 : /* Set debug_query_string for individual workers first */
890 noah 1829 GIC 71 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1844 rhaas 1830 71 : debug_query_string = sharedquery;
1844 rhaas 1831 ECB :
1832 : /* Report the query string from leader */
1844 rhaas 1833 GIC 71 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
1834 :
1844 rhaas 1835 ECB : /* Look up nbtree shared state */
1892 rhaas 1836 GIC 71 : btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1837 :
1892 rhaas 1838 ECB : /* Open relations using lock modes known to be obtained by index.c */
1892 rhaas 1839 GIC 71 : if (!btshared->isconcurrent)
1840 : {
1892 rhaas 1841 CBC 71 : heapLockmode = ShareLock;
1892 rhaas 1842 GIC 71 : indexLockmode = AccessExclusiveLock;
1892 rhaas 1843 ECB : }
1844 : else
1845 : {
1892 rhaas 1846 UIC 0 : heapLockmode = ShareUpdateExclusiveLock;
1847 0 : indexLockmode = RowExclusiveLock;
1892 rhaas 1848 EUB : }
1849 :
1850 : /* Open relations within worker */
1539 andres 1851 GIC 71 : heapRel = table_open(btshared->heaprelid, heapLockmode);
1892 rhaas 1852 71 : indexRel = index_open(btshared->indexrelid, indexLockmode);
1892 rhaas 1853 ECB :
1854 : /* Initialize worker's own spool */
1892 rhaas 1855 GIC 71 : btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1856 71 : btspool->heap = heapRel;
1892 rhaas 1857 CBC 71 : btspool->index = indexRel;
1858 71 : btspool->isunique = btshared->isunique;
430 peter 1859 71 : btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1892 rhaas 1860 ECB :
1861 : /* Look up shared state private to tuplesort.c */
1892 rhaas 1862 GIC 71 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1863 71 : tuplesort_attach_shared(sharedsort, seg);
1892 rhaas 1864 CBC 71 : if (!btshared->isunique)
1892 rhaas 1865 ECB : {
1892 rhaas 1866 CBC 39 : btspool2 = NULL;
1892 rhaas 1867 GIC 39 : sharedsort2 = NULL;
1892 rhaas 1868 ECB : }
1869 : else
1870 : {
1871 : /* Allocate memory for worker's own private secondary spool */
1892 rhaas 1872 GIC 32 : btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1873 :
1892 rhaas 1874 ECB : /* Initialize worker's own secondary spool */
1892 rhaas 1875 GIC 32 : btspool2->heap = btspool->heap;
1876 32 : btspool2->index = btspool->index;
1892 rhaas 1877 CBC 32 : btspool2->isunique = false;
1892 rhaas 1878 ECB : /* Look up shared state private to tuplesort.c */
1892 rhaas 1879 CBC 32 : sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1892 rhaas 1880 GIC 32 : tuplesort_attach_shared(sharedsort2, seg);
1892 rhaas 1881 ECB : }
1882 :
1883 : /* Prepare to track buffer usage during parallel execution */
1100 akapila 1884 GIC 71 : InstrStartParallelQuery();
1885 :
1892 rhaas 1886 ECB : /* Perform sorting of spool, and possibly a spool2 */
1892 rhaas 1887 GIC 71 : sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1888 71 : _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1468 alvherre 1889 ECB : sharedsort2, sortmem, false);
1892 rhaas 1890 :
1891 : /* Report WAL/buffer usage during parallel execution */
1095 akapila 1892 GIC 71 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1100 1893 71 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1095 akapila 1894 CBC 71 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1895 71 : &walusage[ParallelWorkerNumber]);
1100 akapila 1896 ECB :
1892 rhaas 1897 : #ifdef BTREE_BUILD_STATS
1898 : if (log_btree_build_stats)
1899 : {
1900 : ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1901 : ResetUsage();
1902 : }
1903 : #endif /* BTREE_BUILD_STATS */
1904 :
1892 rhaas 1905 GIC 71 : index_close(indexRel, indexLockmode);
1539 andres 1906 71 : table_close(heapRel, heapLockmode);
1892 rhaas 1907 CBC 71 : }
1892 rhaas 1908 ECB :
1909 : /*
1910 : * Perform a worker's portion of a parallel sort.
1911 : *
 1912 : * This generates a tuplesort for the passed btspool, and a second tuplesort
 1913 : * state if a second btspool is needed (i.e. for unique index builds). All
1914 : * other spool fields should already be set when this is called.
1915 : *
1916 : * sortmem is the amount of working memory to use within each worker,
1917 : * expressed in KBs.
1918 : *
1919 : * When this returns, workers are done, and need only release resources.
1920 : */
1921 : static void
1892 rhaas 1922 GIC 142 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1923 : BTShared *btshared, Sharedsort *sharedsort,
1468 alvherre 1924 ECB : Sharedsort *sharedsort2, int sortmem, bool progress)
1925 : {
1926 : SortCoordinate coordinate;
1927 : BTBuildState buildstate;
1928 : TableScanDesc scan;
1929 : double reltuples;
1930 : IndexInfo *indexInfo;
1931 :
1932 : /* Initialize local tuplesort coordination state */
1892 rhaas 1933 GIC 142 : coordinate = palloc0(sizeof(SortCoordinateData));
1934 142 : coordinate->isWorker = true;
1892 rhaas 1935 CBC 142 : coordinate->nParticipants = -1;
1936 142 : coordinate->sharedsort = sharedsort;
1892 rhaas 1937 ECB :
1938 : /* Begin "partial" tuplesort */
1892 rhaas 1939 GIC 284 : btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1940 : btspool->index,
1892 rhaas 1941 CBC 142 : btspool->isunique,
430 peter 1942 GIC 142 : btspool->nulls_not_distinct,
1892 rhaas 1943 ECB : sortmem, coordinate,
370 drowley 1944 : TUPLESORT_NONE);
1945 :
1946 : /*
1947 : * Just as with serial case, there may be a second spool. If so, a
1948 : * second, dedicated spool2 partial tuplesort is required.
1949 : */
1892 rhaas 1950 GIC 142 : if (btspool2)
1951 : {
1892 rhaas 1952 ECB : SortCoordinate coordinate2;
1953 :
1954 : /*
1955 : * We expect that the second one (for dead tuples) won't get very
1956 : * full, so we give it only work_mem (unless sortmem is less for
1957 : * worker). Worker processes are generally permitted to allocate
1958 : * work_mem independently.
1959 : */
1892 rhaas 1960 GIC 64 : coordinate2 = palloc0(sizeof(SortCoordinateData));
1961 64 : coordinate2->isWorker = true;
1892 rhaas 1962 CBC 64 : coordinate2->nParticipants = -1;
1963 64 : coordinate2->sharedsort = sharedsort2;
1964 64 : btspool2->sortstate =
430 peter 1965 64 : tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1892 rhaas 1966 ECB : Min(sortmem, work_mem), coordinate2,
 1967 : TUPLESORT_NONE);
1968 : }
1969 :
1970 : /* Fill in buildstate for _bt_build_callback() */
1892 rhaas 1971 GIC 142 : buildstate.isunique = btshared->isunique;
430 peter 1972 142 : buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1892 rhaas 1973 CBC 142 : buildstate.havedead = false;
1974 142 : buildstate.heap = btspool->heap;
1975 142 : buildstate.spool = btspool;
1976 142 : buildstate.spool2 = btspool2;
1977 142 : buildstate.indtuples = 0;
1978 142 : buildstate.btleader = NULL;
1892 rhaas 1979 ECB :
1980 : /* Join parallel scan */
1892 rhaas 1981 GIC 142 : indexInfo = BuildIndexInfo(btspool->index);
1982 142 : indexInfo->ii_Concurrent = btshared->isconcurrent;
1468 alvherre 1983 CBC 142 : scan = table_beginscan_parallel(btspool->heap,
1468 alvherre 1984 ECB : ParallelTableScanFromBTShared(btshared));
1474 andres 1985 CBC 142 : reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1986 : true, progress, _bt_build_callback,
1474 andres 1987 ECB : (void *) &buildstate, scan);
1988 :
1989 : /* Execute this worker's part of the sort */
667 alvherre 1990 GIC 142 : if (progress)
1991 71 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
667 alvherre 1992 ECB : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1892 rhaas 1993 CBC 142 : tuplesort_performsort(btspool->sortstate);
1892 rhaas 1994 GIC 142 : if (btspool2)
667 alvherre 1995 ECB : {
667 alvherre 1996 CBC 64 : if (progress)
667 alvherre 1997 GIC 32 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
667 alvherre 1998 ECB : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1892 rhaas 1999 CBC 64 : tuplesort_performsort(btspool2->sortstate);
2000 : }
1892 rhaas 2001 ECB :
2002 : /*
2003 : * Done. Record ambuild statistics, and whether we encountered a broken
2004 : * HOT chain.
2005 : */
1892 rhaas 2006 GIC 142 : SpinLockAcquire(&btshared->mutex);
2007 142 : btshared->nparticipantsdone++;
1892 rhaas 2008 CBC 142 : btshared->reltuples += reltuples;
2009 142 : if (buildstate.havedead)
1892 rhaas 2010 LBC 0 : btshared->havedead = true;
1892 rhaas 2011 CBC 142 : btshared->indtuples += buildstate.indtuples;
1892 rhaas 2012 GBC 142 : if (indexInfo->ii_BrokenHotChain)
1892 rhaas 2013 LBC 0 : btshared->brokenhotchain = true;
1892 rhaas 2014 CBC 142 : SpinLockRelease(&btshared->mutex);
1892 rhaas 2015 EUB :
1892 rhaas 2016 ECB : /* Notify leader */
1892 rhaas 2017 GIC 142 : ConditionVariableSignal(&btshared->workersdonecv);
2018 :
1892 rhaas 2019 ECB : /* We can end tuplesorts immediately */
1892 rhaas 2020 GIC 142 : tuplesort_end(btspool->sortstate);
2021 142 : if (btspool2)
1892 rhaas 2022 CBC 64 : tuplesort_end(btspool2->sortstate);
2023 142 : }
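
Each participant above sets up a worker-flavored SortCoordinate (isWorker = true, nParticipants = -1). For orientation, the leader's merging tuplesort, created earlier in this file in _bt_spools_heapscan(), uses the complementary settings; roughly:

    /* Sketch of the leader-side coordinate (the real code is in _bt_spools_heapscan) */
    static SortCoordinate
    example_leader_coordinate(BTBuildState *buildstate)
    {
        SortCoordinate coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));

        coordinate->isWorker = false;   /* the leader merges the workers' runs */
        coordinate->nParticipants = buildstate->btleader->nparticipanttuplesorts;
        coordinate->sharedsort = buildstate->btleader->sharedsort;

        return coordinate;
    }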