LCOV - differential code coverage report
Current view:   top level - src/backend/access/nbtree - nbtsort.c (source / functions)
Current:        Differential Code Coverage HEAD vs 15
Current Date:   2023-04-08 15:15:32
Baseline:       15
Baseline Date:  2023-04-08 15:09:40

             Coverage  Total    Hit    LBC    UIC    UBC    GBC    GIC    GNC    CBC    EUB    ECB    DCB
Lines:         97.1 %    580    563      7      6      4      2    255      7    299     11    251      6
Functions:    100.0 %     22     22                                 14      3      5            14

Legend: Lines: hit / not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtsort.c
       4                 :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5                 :  *
       6                 :  * NOTES
       7                 :  *
       8                 :  * We use tuplesort.c to sort the given index tuples into order.
       9                 :  * Then we scan the index tuples in order and build the btree pages
      10                 :  * for each level.  We load source tuples into leaf-level pages.
      11                 :  * Whenever we fill a page at one level, we add a link to it to its
      12                 :  * parent level (starting a new parent level if necessary).  When
      13                 :  * done, we write out each final page on each level, adding it to
      14                 :  * its parent level.  When we have only one page on a level, it must be
      15                 :  * the root -- it can be attached to the btree metapage and we are done.
      16                 :  *
      17                 :  * It is not wise to pack the pages entirely full, since then *any*
      18                 :  * insertion would cause a split (and not only of the leaf page; the need
      19                 :  * for a split would cascade right up the tree).  The steady-state load
      20                 :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21                 :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22                 :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23                 :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24                 :  * cascading splits during early insertions.
      25                 :  *
      26                 :  * Formerly the index pages being built were kept in shared buffers, but
      27                 :  * that is of no value (since other backends have no interest in them yet)
      28                 :  * and it created locking problems for CHECKPOINT, because the upper-level
      29                 :  * pages were held exclusive-locked for long periods.  Now we just build
      30                 :  * the pages in local memory and smgrwrite or smgrextend them as we finish
      31                 :  * them.  They will need to be re-read into shared buffers on first use after
      32                 :  * the build finishes.
      33                 :  *
      34                 :  * This code isn't concerned about the FSM at all. The caller is responsible
      35                 :  * for initializing that.
      36                 :  *
      37                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
      38                 :  * Portions Copyright (c) 1994, Regents of the University of California
      39                 :  *
      40                 :  * IDENTIFICATION
      41                 :  *    src/backend/access/nbtree/nbtsort.c
      42                 :  *
      43                 :  *-------------------------------------------------------------------------
      44                 :  */
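
   (A minimal sketch, not part of nbtsort.c, to make the packing rule above
   concrete.  It assumes the default 8 kB block size and the fillfactor
   constants from nbtree.h; "full" is expressed as the amount of free space
   below which a page is closed off.)

       #define BLCKSZ 8192                     /* default PostgreSQL block size */
       #define BTREE_DEFAULT_FILLFACTOR 90     /* leaf default, user-controllable */
       #define BTREE_NONLEAF_FILLFACTOR 70     /* hard-wired for upper pages */

       /* free-space thresholds at which a page is considered "full" */
       static const int leaf_full  = BLCKSZ * (100 - BTREE_DEFAULT_FILLFACTOR) / 100;  /* 819 bytes */
       static const int upper_full = BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;  /* 2457 bytes */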
      45                 : 
      46                 : #include "postgres.h"
      47                 : 
      48                 : #include "access/nbtree.h"
      49                 : #include "access/parallel.h"
      50                 : #include "access/relscan.h"
      51                 : #include "access/table.h"
      52                 : #include "access/xact.h"
      53                 : #include "access/xlog.h"
      54                 : #include "access/xloginsert.h"
      55                 : #include "catalog/index.h"
      56                 : #include "commands/progress.h"
      57                 : #include "executor/instrument.h"
      58                 : #include "miscadmin.h"
      59                 : #include "pgstat.h"
      60                 : #include "storage/smgr.h"
      61                 : #include "tcop/tcopprot.h"        /* pgrminclude ignore */
      62                 : #include "utils/rel.h"
      63                 : #include "utils/sortsupport.h"
      64                 : #include "utils/tuplesort.h"
      65                 : 
      66                 : 
      67                 : /* Magic numbers for parallel state sharing */
      68                 : #define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
      69                 : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
      70                 : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      71                 : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
      72                 : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
      73                 : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
      74                 : 
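
   (A minimal sketch of how these keys are used, mirroring the pattern in the
   parallel-build code later in this file; variable names are illustrative and
   error handling is omitted.)

       /* leader: publish the shared build state under its magic key */
       shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);

       /* worker: look the same state up again from its copy of the TOC */
       btshared = (BTShared *) shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);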
      75                 : /*
      76                 :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      77                 :  * parallel index builds.  This may be useful as a debugging aid.
      78                 : #undef DISABLE_LEADER_PARTICIPATION
      79                 :  */
      80                 : 
      81                 : /*
      82                 :  * Status record for spooling/sorting phase.  (Note we may have two of
      83                 :  * these due to the special requirements for uniqueness-checking with
      84                 :  * dead tuples.)
      85                 :  */
      86                 : typedef struct BTSpool
      87                 : {
      88                 :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      89                 :     Relation    heap;
      90                 :     Relation    index;
      91                 :     bool        isunique;
      92                 :     bool        nulls_not_distinct;
      93                 : } BTSpool;
      94                 : 
      95                 : /*
      96                 :  * Status for index builds performed in parallel.  This is allocated in a
      97                 :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      98                 :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      99                 :  */
     100                 : typedef struct BTShared
     101                 : {
     102                 :     /*
     103                 :      * These fields are not modified during the sort.  They primarily exist
     104                 :      * for the benefit of worker processes that need to create BTSpool state
     105                 :      * corresponding to that used by the leader.
     106                 :      */
     107                 :     Oid         heaprelid;
     108                 :     Oid         indexrelid;
     109                 :     bool        isunique;
     110                 :     bool        nulls_not_distinct;
     111                 :     bool        isconcurrent;
     112                 :     int         scantuplesortstates;
     113                 : 
     114                 :     /*
     115                 :      * workersdonecv is used to monitor the progress of workers.  All parallel
     116                 :      * participants must indicate that they are done before leader can use
     117                 :      * mutable state that workers maintain during scan (and before leader can
     118                 :      * proceed to tuplesort_performsort()).
     119                 :      */
     120                 :     ConditionVariable workersdonecv;
     121                 : 
     122                 :     /*
     123                 :      * mutex protects all fields before heapdesc.
     124                 :      *
     125                 :      * These fields contain status information of interest to B-Tree index
     126                 :      * builds that must work just the same when an index is built in parallel.
     127                 :      */
     128                 :     slock_t     mutex;
     129                 : 
     130                 :     /*
     131                 :      * Mutable state that is maintained by workers, and reported back to
     132                 :      * leader at end of parallel scan.
     133                 :      *
     134                 :      * nparticipantsdone is number of worker processes finished.
     135                 :      *
     136                 :      * reltuples is the total number of input heap tuples.
     137                 :      *
     138                 :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     139                 :      * build.
     140                 :      *
     141                 :      * indtuples is the total number of tuples that made it into the index.
     142                 :      *
     143                 :      * brokenhotchain indicates if any worker detected a broken HOT chain
     144                 :      * during build.
     145                 :      */
     146                 :     int         nparticipantsdone;
     147                 :     double      reltuples;
     148                 :     bool        havedead;
     149                 :     double      indtuples;
     150                 :     bool        brokenhotchain;
     151                 : 
     152                 :     /*
     153                 :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     154                 :      * implementations of the parallel table scan desc interface might need
     155                 :      * stronger alignment.
     156                 :      */
     157                 : } BTShared;
     158                 : 
     159                 : /*
     160                 :  * Return pointer to a BTShared's parallel table scan.
     161                 :  *
      162                 :  * cf. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     163                 :  * MAXALIGN.
     164                 :  */
     165                 : #define ParallelTableScanFromBTShared(shared) \
     166                 :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
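
   (A short usage sketch, illustrative only; this mirrors how the parallel-build
   code further down initializes the scan descriptor that trails BTShared in the
   DSM segment.)

       /* the scan descriptor starts at the first BUFFERALIGN boundary past BTShared */
       ParallelTableScanDesc pscan = ParallelTableScanFromBTShared(btshared);
       table_parallelscan_initialize(heap, pscan, snapshot);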
     167                 : 
     168                 : /*
     169                 :  * Status for leader in parallel index build.
     170                 :  */
     171                 : typedef struct BTLeader
     172                 : {
     173                 :     /* parallel context itself */
     174                 :     ParallelContext *pcxt;
     175                 : 
     176                 :     /*
     177                 :      * nparticipanttuplesorts is the exact number of worker processes
     178                 :      * successfully launched, plus one leader process if it participates as a
     179                 :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     180                 :      * participating as a worker).
     181                 :      */
     182                 :     int         nparticipanttuplesorts;
     183                 : 
     184                 :     /*
     185                 :      * Leader process convenience pointers to shared state (leader avoids TOC
     186                 :      * lookups).
     187                 :      *
     188                 :      * btshared is the shared state for entire build.  sharedsort is the
     189                 :      * shared, tuplesort-managed state passed to each process tuplesort.
     190                 :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     191                 :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     192                 :      * an MVCC snapshot is required.
     193                 :      */
     194                 :     BTShared   *btshared;
     195                 :     Sharedsort *sharedsort;
     196                 :     Sharedsort *sharedsort2;
     197                 :     Snapshot    snapshot;
     198                 :     WalUsage   *walusage;
     199                 :     BufferUsage *bufferusage;
     200                 : } BTLeader;
     201                 : 
     202                 : /*
     203                 :  * Working state for btbuild and its callback.
     204                 :  *
     205                 :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     206                 :  * participant.
     207                 :  */
     208                 : typedef struct BTBuildState
     209                 : {
     210                 :     bool        isunique;
     211                 :     bool        nulls_not_distinct;
     212                 :     bool        havedead;
     213                 :     Relation    heap;
     214                 :     BTSpool    *spool;
     215                 : 
     216                 :     /*
     217                 :      * spool2 is needed only when the index is a unique index. Dead tuples are
      218                 :  * put into spool2 instead of spool to avoid the uniqueness check.
     219                 :      */
     220                 :     BTSpool    *spool2;
     221                 :     double      indtuples;
     222                 : 
     223                 :     /*
     224                 :      * btleader is only present when a parallel index build is performed, and
     225                 :      * only in the leader process. (Actually, only the leader has a
     226                 :      * BTBuildState.  Workers have their own spool and spool2, though.)
     227                 :      */
     228                 :     BTLeader   *btleader;
     229                 : } BTBuildState;
     230                 : 
     231                 : /*
     232                 :  * Status record for a btree page being built.  We have one of these
     233                 :  * for each active tree level.
     234                 :  */
     235                 : typedef struct BTPageState
     236                 : {
     237                 :     Page        btps_page;      /* workspace for page building */
     238                 :     BlockNumber btps_blkno;     /* block # to write this page at */
     239                 :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     240                 :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     241                 :     Size        btps_lastextra; /* last item's extra posting list space */
     242                 :     uint32      btps_level;     /* tree level (0 = leaf) */
     243                 :     Size        btps_full;      /* "full" if less than this much free space */
     244                 :     struct BTPageState *btps_next;  /* link to parent level, if any */
     245                 : } BTPageState;
     246                 : 
     247                 : /*
     248                 :  * Overall status record for index writing phase.
     249                 :  */
     250                 : typedef struct BTWriteState
     251                 : {
     252                 :     Relation    heap;
     253                 :     Relation    index;
     254                 :     BTScanInsert inskey;        /* generic insertion scankey */
     255                 :     bool        btws_use_wal;   /* dump pages to WAL? */
     256                 :     BlockNumber btws_pages_alloced; /* # pages allocated */
     257                 :     BlockNumber btws_pages_written; /* # pages written out */
     258                 :     Page        btws_zeropage;  /* workspace for filling zeroes */
     259                 : } BTWriteState;
     260                 : 
     261                 : 
     262                 : static double _bt_spools_heapscan(Relation heap, Relation index,
     263                 :                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     264                 : static void _bt_spooldestroy(BTSpool *btspool);
     265                 : static void _bt_spool(BTSpool *btspool, ItemPointer self,
     266                 :                       Datum *values, bool *isnull);
     267                 : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     268                 : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     269                 :                                bool *isnull, bool tupleIsAlive, void *state);
     270                 : static Page _bt_blnewpage(uint32 level);
     271                 : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     272                 : static void _bt_slideleft(Page rightmostpage);
     273                 : static void _bt_sortaddtup(Page page, Size itemsize,
     274                 :                            IndexTuple itup, OffsetNumber itup_off,
     275                 :                            bool newfirstdataitem);
     276                 : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     277                 :                          IndexTuple itup, Size truncextra);
     278                 : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     279                 :                                           BTPageState *state,
     280                 :                                           BTDedupState dstate);
     281                 : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     282                 : static void _bt_load(BTWriteState *wstate,
     283                 :                      BTSpool *btspool, BTSpool *btspool2);
     284                 : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     285                 :                                int request);
     286                 : static void _bt_end_parallel(BTLeader *btleader);
     287                 : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     288                 : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     289                 :                                     bool *brokenhotchain);
     290                 : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     291                 : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     292                 :                                        BTShared *btshared, Sharedsort *sharedsort,
     293                 :                                        Sharedsort *sharedsort2, int sortmem,
     294                 :                                        bool progress);
     295                 : 
     296                 : 
     297                 : /*
     298                 :  *  btbuild() -- build a new btree index.
     299                 :  */
     300                 : IndexBuildResult *
     301 CBC       64158 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     302                 : {
     303                 :     IndexBuildResult *result;
     304                 :     BTBuildState buildstate;
     305                 :     double      reltuples;
     306                 : 
     307                 : #ifdef BTREE_BUILD_STATS
     308                 :     if (log_btree_build_stats)
     309                 :         ResetUsage();
     310                 : #endif                          /* BTREE_BUILD_STATS */
     311                 : 
     312           64158 :     buildstate.isunique = indexInfo->ii_Unique;
     313           64158 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     314           64158 :     buildstate.havedead = false;
     315           64158 :     buildstate.heap = heap;
     316           64158 :     buildstate.spool = NULL;
     317           64158 :     buildstate.spool2 = NULL;
     318           64158 :     buildstate.indtuples = 0;
     319           64158 :     buildstate.btleader = NULL;
     320                 : 
     321                 :     /*
     322                 :      * We expect to be called exactly once for any index relation. If that's
     323                 :      * not the case, big trouble's what we have.
     324                 :      */
     325           64158 :     if (RelationGetNumberOfBlocks(index) != 0)
     326 UBC           0 :         elog(ERROR, "index \"%s\" already contains data",
     327                 :              RelationGetRelationName(index));
     328                 : 
     329 CBC       64158 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     330                 : 
     331                 :     /*
     332                 :      * Finish the build by (1) completing the sort of the spool file, (2)
     333                 :      * inserting the sorted tuples into btree pages and (3) building the upper
     334                 :      * levels.  Finally, it may also be necessary to end use of parallelism.
     335                 :      */
     336           64155 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     337           64113 :     _bt_spooldestroy(buildstate.spool);
     338           64113 :     if (buildstate.spool2)
     339              13 :         _bt_spooldestroy(buildstate.spool2);
     340           64113 :     if (buildstate.btleader)
     341              71 :         _bt_end_parallel(buildstate.btleader);
     342                 : 
     343           64113 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     344                 : 
     345           64113 :     result->heap_tuples = reltuples;
     346           64113 :     result->index_tuples = buildstate.indtuples;
     347                 : 
     348                 : #ifdef BTREE_BUILD_STATS
     349                 :     if (log_btree_build_stats)
     350                 :     {
     351                 :         ShowUsage("BTREE BUILD STATS");
     352                 :         ResetUsage();
     353                 :     }
     354                 : #endif                          /* BTREE_BUILD_STATS */
     355                 : 
     356           64113 :     return result;
     357                 : }
     358                 : 
     359                 : /*
     360                 :  * Create and initialize one or two spool structures, and save them in caller's
      361                 :  * buildstate argument.  May also fill in fields within indexInfo used by index
     362                 :  * builds.
     363                 :  *
     364                 :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     365                 :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     366                 :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     367                 :  *
     368                 :  * Returns the total number of heap tuples scanned.
     369                 :  */
     370                 : static double
     371           64158 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     372                 :                     IndexInfo *indexInfo)
     373                 : {
     374           64158 :     BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
     375           64158 :     SortCoordinate coordinate = NULL;
     376           64158 :     double      reltuples = 0;
     377                 : 
     378                 :     /*
     379                 :      * We size the sort area as maintenance_work_mem rather than work_mem to
     380                 :      * speed index creation.  This should be OK since a single backend can't
     381                 :      * run multiple index creations in parallel (see also: notes on
     382                 :      * parallelism and maintenance_work_mem below).
     383                 :      */
     384           64158 :     btspool->heap = heap;
     385           64158 :     btspool->index = index;
     386           64158 :     btspool->isunique = indexInfo->ii_Unique;
     387           64158 :     btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     388                 : 
     389                 :     /* Save as primary spool */
     390           64158 :     buildstate->spool = btspool;
     391                 : 
     392                 :     /* Report table scan phase started */
     393           64158 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     394                 :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     395                 : 
     396                 :     /* Attempt to launch parallel worker scan when required */
     397           64158 :     if (indexInfo->ii_ParallelWorkers > 0)
     398              71 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     399                 :                            indexInfo->ii_ParallelWorkers);
     400                 : 
     401                 :     /*
     402                 :      * If parallel build requested and at least one worker process was
     403                 :      * successfully launched, set up coordination state
     404                 :      */
     405           64158 :     if (buildstate->btleader)
     406                 :     {
     407              71 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     408              71 :         coordinate->isWorker = false;
     409              71 :         coordinate->nParticipants =
     410              71 :             buildstate->btleader->nparticipanttuplesorts;
     411              71 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     412                 :     }
     413                 : 
     414                 :     /*
     415                 :      * Begin serial/leader tuplesort.
     416                 :      *
     417                 :      * In cases where parallelism is involved, the leader receives the same
     418                 :      * share of maintenance_work_mem as a serial sort (it is generally treated
     419                 :      * in the same way as a serial sort once we return).  Parallel worker
     420                 :      * Tuplesortstates will have received only a fraction of
     421                 :      * maintenance_work_mem, though.
     422                 :      *
     423                 :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     424                 :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     425                 :      * some small overlap, but that's okay because we rely on leader
     426                 :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     427                 :      * When its tuplesort_performsort() is called (by our caller), and
     428                 :      * significant amounts of memory are likely to be used, all workers must
     429                 :      * have already freed almost all memory held by their Tuplesortstates
     430                 :      * (they are about to go away completely, too).  The overall effect is
     431                 :      * that maintenance_work_mem always represents an absolute high watermark
     432                 :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     433                 :      * the use of parallelism or any other factor.
     434                 :      */
     435          128316 :     buildstate->spool->sortstate =
     436           64158 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     437           64158 :                                     buildstate->nulls_not_distinct,
     438                 :                                     maintenance_work_mem, coordinate,
     439                 :                                     TUPLESORT_NONE);
     440                 : 
     441                 :     /*
     442                 :      * If building a unique index, put dead tuples in a second spool to keep
     443                 :      * them out of the uniqueness check.  We expect that the second spool (for
     444                 :      * dead tuples) won't get very full, so we give it only work_mem.
     445                 :      */
     446           64158 :     if (indexInfo->ii_Unique)
     447                 :     {
     448           56640 :         BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
     449           56640 :         SortCoordinate coordinate2 = NULL;
     450                 : 
     451                 :         /* Initialize secondary spool */
     452           56640 :         btspool2->heap = heap;
     453           56640 :         btspool2->index = index;
     454           56640 :         btspool2->isunique = false;
     455                 :         /* Save as secondary spool */
     456           56640 :         buildstate->spool2 = btspool2;
     457                 : 
     458           56640 :         if (buildstate->btleader)
     459                 :         {
     460                 :             /*
     461                 :              * Set up non-private state that is passed to
     462                 :              * tuplesort_begin_index_btree() about the basic high level
     463                 :              * coordination of a parallel sort.
     464                 :              */
     465              32 :             coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     466              32 :             coordinate2->isWorker = false;
     467              32 :             coordinate2->nParticipants =
     468              32 :                 buildstate->btleader->nparticipanttuplesorts;
     469              32 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     470                 :         }
     471                 : 
     472                 :         /*
     473                 :          * We expect that the second one (for dead tuples) won't get very
     474                 :          * full, so we give it only work_mem
     475                 :          */
     476           56640 :         buildstate->spool2->sortstate =
     477           56640 :             tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     478                 :                                         coordinate2, TUPLESORT_NONE);
     479                 :     }
     480                 : 
     481                 :     /* Fill spool using either serial or parallel heap scan */
     482           64158 :     if (!buildstate->btleader)
     483           64087 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     484                 :                                            _bt_build_callback, (void *) buildstate,
     485                 :                                            NULL);
     486                 :     else
     487              71 :         reltuples = _bt_parallel_heapscan(buildstate,
     488                 :                                           &indexInfo->ii_BrokenHotChain);
     489                 : 
     490                 :     /*
     491                 :      * Set the progress target for the next phase.  Reset the block number
      492                 :      * values set by table_index_build_scan.
     493                 :      */
     494                 :     {
     495           64155 :         const int   progress_index[] = {
     496                 :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     497                 :             PROGRESS_SCAN_BLOCKS_TOTAL,
     498                 :             PROGRESS_SCAN_BLOCKS_DONE
     499                 :         };
     500           64155 :         const int64 progress_vals[] = {
     501           64155 :             buildstate->indtuples,
     502                 :             0, 0
     503                 :         };
     504                 : 
     505           64155 :         pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     506                 :     }
     507                 : 
     508                 :     /* okay, all heap tuples are spooled */
     509           64155 :     if (buildstate->spool2 && !buildstate->havedead)
     510                 :     {
     511                 :         /* spool2 turns out to be unnecessary */
     512           56627 :         _bt_spooldestroy(buildstate->spool2);
     513           56627 :         buildstate->spool2 = NULL;
     514                 :     }
     515                 : 
     516           64155 :     return reltuples;
     517                 : }
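
   (To put numbers on the maintenance_work_mem discussion above, a worked sketch
   with hypothetical settings; the actual division happens in the parallel-build
   functions later in this file.)

       int maintenance_work_mem = 65536;   /* kB; hypothetical setting */
       int nparticipanttuplesorts = 3;     /* e.g. 2 workers + participating leader */

       /* each participant tuplesort starts with roughly a third of the budget */
       int sortmem = maintenance_work_mem / nparticipanttuplesorts;    /* 21845 kB */

   The leader's own merge later runs against the full 65536 kB budget, but only
   after the workers have released nearly all of their memory, so the setting
   still acts as a high watermark for the build as a whole.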
     518                 : 
     519                 : /*
     520                 :  * clean up a spool structure and its substructures.
     521                 :  */
     522                 : static void
     523          120753 : _bt_spooldestroy(BTSpool *btspool)
     524                 : {
     525          120753 :     tuplesort_end(btspool->sortstate);
     526          120753 :     pfree(btspool);
     527          120753 : }
     528                 : 
     529                 : /*
     530                 :  * spool an index entry into the sort file.
     531                 :  */
     532                 : static void
     533        11769706 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
     534                 : {
     535        11769706 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     536                 :                                   self, values, isnull);
     537        11769706 : }
     538                 : 
     539                 : /*
     540                 :  * given a spool loaded by successive calls to _bt_spool,
     541                 :  * create an entire btree.
     542                 :  */
     543                 : static void
     544           64155 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     545                 : {
     546                 :     BTWriteState wstate;
     547                 : 
     548                 : #ifdef BTREE_BUILD_STATS
     549                 :     if (log_btree_build_stats)
     550                 :     {
     551                 :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     552                 :         ResetUsage();
     553                 :     }
     554                 : #endif                          /* BTREE_BUILD_STATS */
     555                 : 
     556                 :     /* Execute the sort */
     557           64155 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     558                 :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     559           64155 :     tuplesort_performsort(btspool->sortstate);
     560           64113 :     if (btspool2)
     561                 :     {
     562              13 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     563                 :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     564              13 :         tuplesort_performsort(btspool2->sortstate);
     565                 :     }
     566                 : 
     567           64113 :     wstate.heap = btspool->heap;
     568           64113 :     wstate.index = btspool->index;
     569 GNC       64113 :     wstate.inskey = _bt_mkscankey(wstate.index, btspool->heap, NULL);
     570                 :     /* _bt_mkscankey() won't set allequalimage without metapage */
     571 CBC       64113 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     572           64113 :     wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
     573                 : 
     574                 :     /* reserve the metapage */
     575           64113 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     576           64113 :     wstate.btws_pages_written = 0;
     577           64113 :     wstate.btws_zeropage = NULL;    /* until needed */
     578                 : 
     579           64113 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     580                 :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     581           64113 :     _bt_load(&wstate, btspool, btspool2);
     582           64113 : }
     583                 : 
     584                 : /*
     585                 :  * Per-tuple callback for table_index_build_scan
     586                 :  */
     587                 : static void
     588        11769706 : _bt_build_callback(Relation index,
     589                 :                    ItemPointer tid,
     590                 :                    Datum *values,
     591                 :                    bool *isnull,
     592                 :                    bool tupleIsAlive,
     593                 :                    void *state)
     594                 : {
     595        11769706 :     BTBuildState *buildstate = (BTBuildState *) state;
     596                 : 
     597                 :     /*
     598                 :      * insert the index tuple into the appropriate spool file for subsequent
     599                 :      * processing
     600                 :      */
     601        11769706 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     602        11769481 :         _bt_spool(buildstate->spool, tid, values, isnull);
     603                 :     else
     604                 :     {
     605                 :         /* dead tuples are put into spool2 */
     606             225 :         buildstate->havedead = true;
     607             225 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     608                 :     }
     609                 : 
     610        11769706 :     buildstate->indtuples += 1;
     611        11769706 : }
     612                 : 
     613                 : /*
     614                 :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     615                 :  */
     616                 : static Page
     617           62011 : _bt_blnewpage(uint32 level)
     618                 : {
     619                 :     Page        page;
     620                 :     BTPageOpaque opaque;
     621                 : 
     622 GNC       62011 :     page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
     623                 : 
     624                 :     /* Zero the page and set up standard page header info */
     625 CBC       62011 :     _bt_pageinit(page, BLCKSZ);
     626                 : 
     627                 :     /* Initialize BT opaque state */
     628           62011 :     opaque = BTPageGetOpaque(page);
     629           62011 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     630           62011 :     opaque->btpo_level = level;
     631           62011 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     632           62011 :     opaque->btpo_cycleid = 0;
     633                 : 
     634                 :     /* Make the P_HIKEY line pointer appear allocated */
     635           62011 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     636                 : 
     637           62011 :     return page;
     638                 : }
     639                 : 
     640                 : /*
     641                 :  * emit a completed btree page, and release the working storage.
     642                 :  */
     643                 : static void
     644          126124 : _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
     645                 : {
     646                 :     /* XLOG stuff */
     647          126124 :     if (wstate->btws_use_wal)
     648                 :     {
     649                 :         /* We use the XLOG_FPI record type for this */
     650 GNC      113213 :         log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true);
     651                 :     }
     652                 : 
     653                 :     /*
     654                 :      * If we have to write pages nonsequentially, fill in the space with
     655                 :      * zeroes until we come back and overwrite.  This is not logically
     656                 :      * necessary on standard Unix filesystems (unwritten space will read as
     657                 :      * zeroes anyway), but it should help to avoid fragmentation. The dummy
     658                 :      * pages aren't WAL-logged though.
     659                 :      */
     660 CBC      148363 :     while (blkno > wstate->btws_pages_written)
     661                 :     {
     662           22239 :         if (!wstate->btws_zeropage)
     663 GNC       18501 :             wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
     664                 :                                                           PG_IO_ALIGN_SIZE,
     665                 :                                                           MCXT_ALLOC_ZERO);
     666                 :         /* don't set checksum for all-zero page */
     667 GIC       22239 :         smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
     668           22239 :                    wstate->btws_pages_written++,
     669 GNC       22239 :                    wstate->btws_zeropage,
     670 ECB             :                    true);
     671                 :     }
     672                 : 
     673 GIC      126124 :     PageSetChecksumInplace(page, blkno);
     674                 : 
     675 ECB             :     /*
     676                 :      * Now write the page.  There's no need for smgr to schedule an fsync for
     677                 :      * this write; we'll do it ourselves before ending the build.
     678                 :      */
     679 GIC      126124 :     if (blkno == wstate->btws_pages_written)
     680                 :     {
     681 ECB             :         /* extending the file... */
     682 GIC      103885 :         smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
     683                 :                    page, true);
     684 CBC      103885 :         wstate->btws_pages_written++;
     685                 :     }
     686 ECB             :     else
     687                 :     {
     688                 :         /* overwriting a block we zero-filled before */
     689 GIC       22239 :         smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
     690                 :                   page, true);
     691 ECB             :     }
     692                 : 
     693 GIC      126124 :     pfree(page);
     694          126124 : }
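
   (A self-contained toy of the bookkeeping above, with smgr calls reduced to
   printf; it mimics how a block finished out of order is first backed by a
   zeroed block and later overwritten in place.)

       #include <stdio.h>

       static unsigned pages_written = 0;

       static void toy_writepage(unsigned blkno)
       {
           while (blkno > pages_written)       /* fill any gap with zeroed blocks */
               printf("smgrextend: zeroed block %u\n", pages_written++);

           if (blkno == pages_written)
           {
               printf("smgrextend: block %u\n", blkno);    /* appending at EOF */
               pages_written++;
           }
           else
               printf("smgrwrite: block %u\n", blkno);     /* overwrite zeroed block */
       }

       int main(void)
       {
           toy_writepage(1);   /* a leaf page is finished before the metapage */
           toy_writepage(0);   /* metapage arrives last, overwriting zeroed block 0 */
           return 0;
       }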
     695 ECB             : 
     696                 : /*
     697                 :  * allocate and initialize a new BTPageState.  the returned structure
     698                 :  * is suitable for immediate use by _bt_buildadd.
     699                 :  */
     700                 : static BTPageState *
     701 GIC       23225 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     702                 : {
     703 CBC       23225 :     BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
     704                 : 
     705 ECB             :     /* create initial page for level */
     706 GIC       23225 :     state->btps_page = _bt_blnewpage(level);
     707                 : 
     708 ECB             :     /* and assign it a page position */
     709 GIC       23225 :     state->btps_blkno = wstate->btws_pages_alloced++;
     710                 : 
     711 CBC       23225 :     state->btps_lowkey = NULL;
     712                 :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     713           23225 :     state->btps_lastoff = P_HIKEY;
     714 GIC       23225 :     state->btps_lastextra = 0;
     715 CBC       23225 :     state->btps_level = level;
     716 ECB             :     /* set "full" threshold based on level.  See notes at head of file. */
     717 CBC       23225 :     if (level > 0)
     718 GIC        4724 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     719 ECB             :     else
     720 CBC       18501 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     721                 : 
     722 ECB             :     /* no parent level, yet */
     723 GIC       23225 :     state->btps_next = NULL;
     724                 : 
     725 CBC       23225 :     return state;
     726                 : }
     727 ECB             : 
     728                 : /*
     729                 :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     730                 :  * P_HIKEY, overwriting P_HIKEY).
     731                 :  *
     732                 :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     733                 :  * rightmost page on its level is not supposed to get a high key.  Now that
     734                 :  * it's clear that this page is a rightmost page, remove the unneeded empty
     735                 :  * P_HIKEY line pointer space.
     736                 :  */
     737                 : static void
     738 GIC       23225 : _bt_slideleft(Page rightmostpage)
     739                 : {
     740 ECB             :     OffsetNumber off;
     741                 :     OffsetNumber maxoff;
     742                 :     ItemId      previi;
     743                 : 
     744 GIC       23225 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     745           23225 :     Assert(maxoff >= P_FIRSTKEY);
     746 CBC       23225 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     747         1620941 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     748 ECB             :     {
     749 CBC     1597716 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     750                 : 
     751         1597716 :         *previi = *thisii;
     752 GIC     1597716 :         previi = thisii;
     753 ECB             :     }
     754 CBC       23225 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     755 GIC       23225 : }
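
   (A self-contained toy of the slide, with ItemIds modeled as ints; slot 0
   plays the role of the P_HIKEY line pointer and slots 1..4 are the data
   items.)

       #include <stdio.h>

       int main(void)
       {
           int linp[5] = {0, 11, 22, 33, 44};      /* slot 0: unused high-key slot */
           int maxoff = 4;

           for (int off = 1; off <= maxoff; off++) /* P_FIRSTKEY .. maxoff */
               linp[off - 1] = linp[off];          /* copy each ItemId back one slot */

           /* pd_lower would now shrink by sizeof(ItemIdData), dropping a slot */
           for (int off = 0; off < maxoff; off++)
               printf("%d ", linp[off]);           /* prints: 11 22 33 44 */
           printf("\n");
           return 0;
       }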
     756 ECB             : 
     757                 : /*
     758                 :  * Add an item to a page being built.
     759                 :  *
     760                 :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     761                 :  * raises an error directly.
     762                 :  *
     763                 :  * Note that our nbtsort.c caller does not know yet if the page will be
     764                 :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
      765                 :  * caller.  A page that turns out to be the rightmost on its level is fixed by
     766                 :  * calling _bt_slideleft().
     767                 :  */
     768                 : static void
     769 GIC    11189149 : _bt_sortaddtup(Page page,
     770                 :                Size itemsize,
     771 ECB             :                IndexTuple itup,
     772                 :                OffsetNumber itup_off,
     773                 :                bool newfirstdataitem)
     774                 : {
     775                 :     IndexTupleData trunctuple;
     776                 : 
     777 GIC    11189149 :     if (newfirstdataitem)
     778                 :     {
     779 CBC        4786 :         trunctuple = *itup;
     780 GIC        4786 :         trunctuple.t_info = sizeof(IndexTupleData);
     781 CBC        4786 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     782            4786 :         itup = &trunctuple;
     783            4786 :         itemsize = sizeof(IndexTupleData);
     784 ECB             :     }
     785                 : 
     786 GIC    11189149 :     if (PageAddItem(page, (Item) itup, itemsize, itup_off,
     787                 :                     false, false) == InvalidOffsetNumber)
     788 LBC           0 :         elog(ERROR, "failed to add item to the index page");
     789 GIC    11189149 : }
     790 EUB             : 
     791 ECB             : /*----------
     792                 :  * Add an item to a disk page from the sort output (or add a posting list
     793                 :  * item formed from the sort output).
     794                 :  *
     795                 :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     796                 :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     797                 :  * - on non-leaf pages, the key portion of the first item need not be
      798                 :  *   stored; we should store only the link.
     799                 :  *
     800                 :  * A leaf page being built looks like:
     801                 :  *
     802                 :  * +----------------+---------------------------------+
     803                 :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     804                 :  * +-----------+----+---------------------------------+
     805                 :  * | ... linpN |                                      |
     806                 :  * +-----------+--------------------------------------+
     807                 :  * |     ^ last                                       |
     808                 :  * |                                                  |
     809                 :  * +-------------+------------------------------------+
     810                 :  * |             | itemN ...                          |
     811                 :  * +-------------+------------------+-----------------+
     812                 :  * |          ... item3 item2 item1 | "special space" |
     813                 :  * +--------------------------------+-----------------+
     814                 :  *
     815                 :  * Contrast this with the diagram in bufpage.h; note the mismatch
     816                 :  * between linps and items.  This is because we reserve linp0 as a
     817                 :  * placeholder for the pointer to the "high key" item; when we have
     818                 :  * filled up the page, we will set linp0 to point to itemN and clear
     819                 :  * linpN.  On the other hand, if we find this is the last (rightmost)
     820                 :  * page, we leave the items alone and slide the linp array over.  If
     821                 :  * the high key is to be truncated, offset 1 is deleted, and we insert
     822                 :  * the truncated high key at offset 1.
     823                 :  *
     824                 :  * 'last' pointer indicates the last offset added to the page.
     825                 :  *
     826                 :  * 'truncextra' is the size of the posting list in itup, if any.  This
     827                 :  * information is stashed for the next call here, when we may benefit
     828                 :  * from considering the impact of truncating away the posting list on
     829                 :  * the page before deciding to finish the page off.  Posting lists are
     830                 :  * often relatively large, so it is worth going to the trouble of
     831                 :  * accounting for the saving from truncating away the posting list of
     832                 :  * the tuple that becomes the high key (that may be the only way to
     833                 :  * get close to target free space on the page).  Note that this is
     834                 :  * only used for the soft fillfactor-wise limit, not the critical hard
     835                 :  * limit.
     836                 :  *----------
     837                 :  */
     838                 : static void
     839 GIC    11150363 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     840                 :              Size truncextra)
     841 ECB             : {
     842                 :     Page        npage;
     843                 :     BlockNumber nblkno;
     844                 :     OffsetNumber last_off;
     845                 :     Size        last_truncextra;
     846                 :     Size        pgspc;
     847                 :     Size        itupsz;
     848                 :     bool        isleaf;
     849                 : 
     850                 :     /*
     851                 :      * This is a handy place to check for cancel interrupts during the btree
     852                 :      * load phase of index creation.
     853                 :      */
     854 GIC    11150363 :     CHECK_FOR_INTERRUPTS();
     855                 : 
     856 CBC    11150363 :     npage = state->btps_page;
     857 GIC    11150363 :     nblkno = state->btps_blkno;
     858 CBC    11150363 :     last_off = state->btps_lastoff;
     859        11150363 :     last_truncextra = state->btps_lastextra;
     860        11150363 :     state->btps_lastextra = truncextra;
     861 ECB             : 
     862 CBC    11150363 :     pgspc = PageGetFreeSpace(npage);
     863 GIC    11150363 :     itupsz = IndexTupleSize(itup);
     864 CBC    11150363 :     itupsz = MAXALIGN(itupsz);
     865 ECB             :     /* Leaf case has slightly different rules due to suffix truncation */
     866 CBC    11150363 :     isleaf = (state->btps_level == 0);
     867                 : 
     868 ECB             :     /*
     869                 :      * Check whether the new item can fit on a btree page on current level at
     870                 :      * all.
     871                 :      *
     872                 :      * Every newly built index will treat heap TID as part of the keyspace,
     873                 :      * which imposes the requirement that new high keys must occasionally have
     874                 :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     875                 :      * tuple one or two MAXALIGN() quantums larger than the original
     876                 :      * firstright tuple it's derived from.  v4 deals with the problem by
     877                 :      * decreasing the limit on the size of tuples inserted on the leaf level
     878                 :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     879                 :      * and the old limit on internal levels, since pivot tuples may need to
     880                 :      * make use of the reserved space.  This should never fail on internal
     881                 :      * pages.
     882                 :      */
     883 GIC    11150363 :     if (unlikely(itupsz > BTMaxItemSize(npage)))
     884             132 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     885 ECB             :                              itup);
     886                 : 
     887                 :     /*
     888                 :      * Check to see if current page will fit new item, with space left over to
     889                 :      * append a heap TID during suffix truncation when page is a leaf page.
     890                 :      *
     891                 :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     892                 :      * high key with heap TID when finishing off a leaf page, since we rely on
     893                 :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     894                 :      * internal pages we can always fit 3 pivot tuples with larger internal
     895                 :      * page tuple limit (includes page high key).
     896                 :      *
     897                 :      * Most of the time, a page is only "full" in the sense that the soft
     898                 :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     899                 :      * at least two items plus a high key on each page before starting a new
     900                 :      * page.  Disregard fillfactor and insert on "full" current page if we
     901                 :      * don't have the minimum number of items yet.  (Note that we deliberately
     902                 :      * assume that suffix truncation neither enlarges nor shrinks new high key
     903                 :      * when applying soft limit, except when last tuple has a posting list.)
     904                 :      */
     905 GIC    11150363 :     Assert(last_truncextra == 0 || isleaf);
     906        11150363 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     907 CBC    11149799 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     908 ECB             :     {
     909                 :         /*
     910                 :          * Finish off the page and write it out.
     911                 :          */
     912 GIC       38786 :         Page        opage = npage;
     913           38786 :         BlockNumber oblkno = nblkno;
     914 ECB             :         ItemId      ii;
     915                 :         ItemId      hii;
     916                 :         IndexTuple  oitup;
     917                 : 
     918                 :         /* Create new page of same level */
     919 GIC       38786 :         npage = _bt_blnewpage(state->btps_level);
     920                 : 
     921 ECB             :         /* and assign it a page position */
     922 GIC       38786 :         nblkno = wstate->btws_pages_alloced++;
     923                 : 
     924 ECB             :         /*
     925                 :          * We copy the last item on the page into the new page, and then
     926                 :          * rearrange the old page so that the 'last item' becomes its high key
     927                 :          * rather than a true data item.  There had better be at least two
     928                 :          * items on the page already, else the page would be empty of useful
     929                 :          * data.
     930                 :          */
     931 GIC       38786 :         Assert(last_off > P_FIRSTKEY);
     932           38786 :         ii = PageGetItemId(opage, last_off);
     933 CBC       38786 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     934           38786 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     935           38786 :                        !isleaf);
     936 ECB             : 
     937                 :         /*
     938                 :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     939                 :          * allocated empty space for a line pointer when opage was first
     940                 :          * created, so this is a matter of rearranging already-allocated space
     941                 :          * on page, and initializing high key line pointer. (Actually, leaf
     942                 :          * pages must also swap oitup with a truncated version of oitup, which
     943                 :          * is sometimes larger than oitup, though never by more than the space
     944                 :          * needed to append a heap TID.)
     945                 :          */
     946 GIC       38786 :         hii = PageGetItemId(opage, P_HIKEY);
     947           38786 :         *hii = *ii;
     948 CBC       38786 :         ItemIdSetUnused(ii);    /* redundant */
     949           38786 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     950 ECB             : 
     951 CBC       38786 :         if (isleaf)
     952                 :         {
     953 ECB             :             IndexTuple  lastleft;
     954                 :             IndexTuple  truncated;
     955                 : 
     956                 :             /*
     957                 :              * Truncate away any unneeded attributes from high key on leaf
     958                 :              * level.  This is only done at the leaf level because downlinks
     959                 :              * in internal pages are either negative infinity items, or get
     960                 :              * their contents from copying from one level down.  See also:
     961                 :              * _bt_split().
     962                 :              *
     963                 :              * We don't try to bias our choice of split point to make it more
     964                 :              * likely that _bt_truncate() can truncate away more attributes,
     965                 :              * whereas the split point used within _bt_split() is chosen much
     966                 :              * more delicately.  Even still, the lastleft and firstright
     967                 :              * tuples passed to _bt_truncate() here are at least not fully
     968                 :              * equal to each other when deduplication is used, unless there is
     969                 :              * a large group of duplicates (also, unique index builds usually
     970                 :              * have few or no spool2 duplicates).  When the split point is
     971                 :              * between two unequal tuples, _bt_truncate() will avoid including
     972                 :              * a heap TID in the new high key, which is the most important
     973                 :              * benefit of suffix truncation.
     974                 :              *
     975                 :              * Overwrite the old item with new truncated high key directly.
     976                 :              * oitup is already located at the physical beginning of tuple
     977                 :              * space, so this should directly reuse the existing tuple space.
     978                 :              */
     979 GIC       38724 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     980           38724 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     981 ECB             : 
     982 CBC       38724 :             Assert(IndexTupleSize(oitup) > last_truncextra);
     983 GIC       38724 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     984 ECB             :                                      wstate->inskey);
     985 CBC       38724 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
     986 GIC       38724 :                                          IndexTupleSize(truncated)))
     987 LBC           0 :                 elog(ERROR, "failed to add high key to the index page");
     988 CBC       38724 :             pfree(truncated);
     989 EUB             : 
     990 ECB             :             /* oitup should continue to point to the page's high key */
     991 GIC       38724 :             hii = PageGetItemId(opage, P_HIKEY);
     992           38724 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     993 ECB             :         }
     994                 : 
     995                 :         /*
     996                 :          * Link the old page into its parent, using its low key.  If we don't
     997                 :          * have a parent, we have to create one; this adds a new btree level.
     998                 :          */
     999 GIC       38786 :         if (state->btps_next == NULL)
    1000            4724 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
    1001 ECB             : 
    1002 CBC       38786 :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
    1003                 :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1004 ECB             :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
    1005                 :                P_LEFTMOST(BTPageGetOpaque(opage)));
    1006 GIC       38786 :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
    1007                 :                !P_LEFTMOST(BTPageGetOpaque(opage)));
    1008 CBC       38786 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
    1009 GIC       38786 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
    1010 CBC       38786 :         pfree(state->btps_lowkey);
    1011 ECB             : 
    1012                 :         /*
    1013                 :          * Save a copy of the high key from the old page.  It is also the low
    1014                 :          * key for the new page.
    1015                 :          */
    1016 GIC       38786 :         state->btps_lowkey = CopyIndexTuple(oitup);
    1017                 : 
    1018 ECB             :         /*
    1019                 :          * Set the sibling links for both pages.
    1020                 :          */
    1021                 :         {
    1022 GIC       38786 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
    1023           38786 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
    1024 ECB             : 
    1025 CBC       38786 :             oopaque->btpo_next = nblkno;
    1026 GIC       38786 :             nopaque->btpo_prev = oblkno;
    1027 CBC       38786 :             nopaque->btpo_next = P_NONE; /* redundant */
    1028 ECB             :         }
    1029                 : 
    1030                 :         /*
    1031                 :          * Write out the old page.  We never need to touch it again, so we can
    1032                 :          * free the opage workspace too.
    1033                 :          */
    1034 GIC       38786 :         _bt_blwritepage(wstate, opage, oblkno);
    1035                 : 
    1036 ECB             :         /*
    1037                 :          * Reset last_off to point to new page
    1038                 :          */
    1039 GIC       38786 :         last_off = P_FIRSTKEY;
    1040                 :     }
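                          : 
                          :     /*
                          :      * A worked example of the soft limit above -- a sketch assuming 8kB
                          :      * pages and the default leaf fillfactor of 90.  _bt_pagestate() sets
                          :      * btps_full to the amount of free space we aim to leave behind:
                          :      *
                          :      *    leaf:     BLCKSZ * (100 - 90) / 100 = 8192 * 10 / 100 =  819 bytes
                          :      *    internal: BLCKSZ * (100 - 70) / 100 = 8192 * 30 / 100 = 2457 bytes
                          :      *
                          :      * so a leaf page is treated as "full" once pgspc (plus any posting
                          :      * list overhead carried over in last_truncextra) drops below ~819
                          :      * bytes, unless the minimum-two-items rule forces us to keep going.
                          :      */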
    1041 ECB             : 
    1042                 :     /*
    1043                 :      * By here, either original page is still the current page, or a new page
    1044                 :      * was created that became the current page.  Either way, the current page
     1045                 :      * definitely has space for the new item.
    1046                 :      *
    1047                 :      * If the new item is the first for its page, it must also be the first
    1048                 :      * item on its entire level.  On later same-level pages, a low key for a
    1049                 :      * page will be copied from the prior page in the code above.  Generate a
    1050                 :      * minus infinity low key here instead.
    1051                 :      */
    1052 GIC    11150363 :     if (last_off == P_HIKEY)
    1053                 :     {
    1054 CBC       23225 :         Assert(state->btps_lowkey == NULL);
    1055 GIC       23225 :         state->btps_lowkey = palloc0(sizeof(IndexTupleData));
    1056 CBC       23225 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1057           23225 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1058 ECB             :     }
    1059                 : 
    1060                 :     /*
    1061                 :      * Add the new item into the current page.
    1062                 :      */
    1063 GIC    11150363 :     last_off = OffsetNumberNext(last_off);
    1064        11150363 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1065 CBC    11150363 :                    !isleaf && last_off == P_FIRSTKEY);
    1066 ECB             : 
    1067 CBC    11150363 :     state->btps_page = npage;
    1068 GIC    11150363 :     state->btps_blkno = nblkno;
    1069 CBC    11150363 :     state->btps_lastoff = last_off;
    1070        11150363 : }
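                          : 
                          : /*
                          :  * A minimal standalone sketch (illustrative, not PostgreSQL code) of the
                          :  * page-finishing move in _bt_buildadd() above: the last real item on the
                          :  * old page is copied to the new page, and its old slot is recycled as the
                          :  * old page's high key.  On leaf pages the high key is then suffix
                          :  * truncated; e.g. with a two-attribute index, splitting between
                          :  * ("apple", 1) and ("banana", 9) lets the high key keep just "banana",
                          :  * with no heap TID.  ToyPage and toy_finish_page are invented names.
                          :  */
                          : #define TOY_MAXITEMS 256
                          : 
                          : typedef struct ToyPage
                          : {
                          :     int         nitems;                 /* data items occupy slots 1..nitems */
                          :     const char *items[TOY_MAXITEMS];   /* slot 0 is reserved for the high key */
                          : } ToyPage;
                          : 
                          : static void
                          : toy_finish_page(ToyPage *opage, ToyPage *npage)
                          : {
                          :     const char *last = opage->items[opage->nitems];
                          : 
                          :     npage->items[++npage->nitems] = last;   /* P_FIRSTKEY of the new page */
                          :     opage->nitems--;                        /* drop it from opage's data items */
                          :     opage->items[0] = last;                 /* ... and make it opage's high key */
                          : }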
    1071 ECB             : 
    1072                 : /*
    1073                 :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1074                 :  * is based on saved base tuple, and saved list of heap TIDs.
    1075                 :  *
    1076                 :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1077                 :  * using _bt_buildadd().
    1078                 :  */
    1079                 : static void
    1080 GIC     2402111 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1081                 :                               BTDedupState dstate)
    1082 ECB             : {
    1083 GIC     2402111 :     Assert(dstate->nitems > 0);
    1084                 : 
    1085 CBC     2402111 :     if (dstate->nitems == 1)
    1086 GIC     2372355 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1087 ECB             :     else
    1088                 :     {
    1089                 :         IndexTuple  postingtuple;
    1090                 :         Size        truncextra;
    1091                 : 
    1092                 :         /* form a tuple with a posting list */
    1093 GIC       29756 :         postingtuple = _bt_form_posting(dstate->base,
    1094                 :                                         dstate->htids,
    1095 ECB             :                                         dstate->nhtids);
    1096                 :         /* Calculate posting list overhead */
    1097 GIC       59512 :         truncextra = IndexTupleSize(postingtuple) -
    1098           29756 :             BTreeTupleGetPostingOffset(postingtuple);
    1099 ECB             : 
    1100 CBC       29756 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1101 GIC       29756 :         pfree(postingtuple);
    1102 ECB             :     }
    1103                 : 
    1104 GIC     2402111 :     dstate->nmaxitems = 0;
    1105         2402111 :     dstate->nhtids = 0;
    1106 CBC     2402111 :     dstate->nitems = 0;
    1107         2402111 :     dstate->phystupsize = 0;
    1108         2402111 : }
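                          : 
                          : /*
                          :  * The truncextra arithmetic above, worked through (a sketch assuming the
                          :  * usual 6-byte ItemPointerData): the posting list is the array of heap
                          :  * TIDs stored past the tuple's keys, so
                          :  *
                          :  *    truncextra = IndexTupleSize(tuple) - BTreeTupleGetPostingOffset(tuple)
                          :  *              ~= nhtids * sizeof(ItemPointerData) = nhtids * 6 bytes
                          :  *
                          :  * e.g. a posting list of 100 heap TIDs accounts for roughly 600 bytes
                          :  * that suffix truncation will strip from any high key derived from the
                          :  * tuple, which is why _bt_buildadd() credits last_truncextra back when
                          :  * applying the soft fillfactor limit.
                          :  */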
    1109 ECB             : 
    1110                 : /*
    1111                 :  * Finish writing out the completed btree.
    1112                 :  */
    1113                 : static void
    1114 GIC       64113 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1115                 : {
    1116 ECB             :     BTPageState *s;
    1117 GIC       64113 :     BlockNumber rootblkno = P_NONE;
    1118           64113 :     uint32      rootlevel = 0;
    1119 ECB             :     Page        metapage;
    1120                 : 
    1121                 :     /*
    1122                 :      * Each iteration of this loop completes one more level of the tree.
    1123                 :      */
    1124 GIC       87338 :     for (s = state; s != NULL; s = s->btps_next)
    1125                 :     {
    1126 ECB             :         BlockNumber blkno;
    1127                 :         BTPageOpaque opaque;
    1128                 : 
    1129 GIC       23225 :         blkno = s->btps_blkno;
    1130           23225 :         opaque = BTPageGetOpaque(s->btps_page);
    1131 ECB             : 
    1132                 :         /*
    1133                 :          * We have to link the last page on this level to somewhere.
    1134                 :          *
    1135                 :          * If we're at the top, it's the root, so attach it to the metapage.
    1136                 :          * Otherwise, add an entry for it to its parent using its low key.
    1137                 :          * This may cause the last page of the parent level to split, but
    1138                 :          * that's not a problem -- we haven't gotten to it yet.
    1139                 :          */
    1140 GIC       23225 :         if (s->btps_next == NULL)
    1141                 :         {
    1142 CBC       18501 :             opaque->btpo_flags |= BTP_ROOT;
    1143 GIC       18501 :             rootblkno = blkno;
    1144 CBC       18501 :             rootlevel = s->btps_level;
    1145 ECB             :         }
    1146                 :         else
    1147                 :         {
    1148 GIC        4724 :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1149                 :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1150 ECB             :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1151                 :                    P_LEFTMOST(opaque));
    1152 GIC        4724 :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1153                 :                    !P_LEFTMOST(opaque));
    1154 CBC        4724 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1155 GIC        4724 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1156 CBC        4724 :             pfree(s->btps_lowkey);
    1157            4724 :             s->btps_lowkey = NULL;
    1158 ECB             :         }
    1159                 : 
    1160                 :         /*
    1161                 :          * This is the rightmost page, so the ItemId array needs to be slid
    1162                 :          * back one slot.  Then we can dump out the page.
    1163                 :          */
    1164 GIC       23225 :         _bt_slideleft(s->btps_page);
    1165           23225 :         _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
    1166 CBC       23225 :         s->btps_page = NULL; /* writepage freed the workspace */
    1167 ECB             :     }
    1168                 : 
    1169                 :     /*
    1170                 :      * As the last step in the process, construct the metapage and make it
    1171                 :      * point to the new root (unless we had no data at all, in which case it's
    1172                 :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1173                 :      * by filling in a valid magic number in the metapage.
    1174                 :      */
    1175 GNC       64113 :     metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
    1176 GIC       64113 :     _bt_initmetapage(metapage, rootblkno, rootlevel,
    1177 CBC       64113 :                      wstate->inskey->allequalimage);
    1178           64113 :     _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
    1179           64113 : }
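                          : 
                          : /*
                          :  * A minimal sketch (toy types, invented names) of the bottom-up pass in
                          :  * _bt_uppershutdown() above: each level's last page either feeds its low
                          :  * key to the parent level, or, when no parent exists, becomes the root
                          :  * recorded in the metapage.
                          :  */
                          : typedef struct ToyLevel
                          : {
                          :     struct ToyLevel *next;      /* parent level, or NULL at the top */
                          :     unsigned         lastblk;   /* block number of this level's last page */
                          : } ToyLevel;
                          : 
                          : static unsigned
                          : toy_find_root(ToyLevel *leaf)
                          : {
                          :     ToyLevel   *s = leaf;
                          : 
                          :     while (s->next != NULL)
                          :         s = s->next;            /* lower levels only feed their parents */
                          :     return s->lastblk;          /* top level's last page is the root */
                          : }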
    1180 ECB             : 
    1181                 : /*
    1182                 :  * Read tuples in correct sort order from tuplesort, and load them into
    1183                 :  * btree leaves.
    1184                 :  */
    1185                 : static void
    1186 GIC       64113 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1187                 : {
    1188 CBC       64113 :     BTPageState *state = NULL;
    1189 GIC       64113 :     bool        merge = (btspool2 != NULL);
    1190 ECB             :     IndexTuple  itup,
    1191 CBC       64113 :                 itup2 = NULL;
    1192                 :     bool        load1;
    1193           64113 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1194                 :     int         i,
    1195           64113 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1196                 :     SortSupport sortKeys;
    1197           64113 :     int64       tuples_done = 0;
    1198                 :     bool        deduplicate;
    1199 ECB             : 
    1200 GIC       71550 :     deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1201            7437 :         BTGetDeduplicateItems(wstate->index);
    1202 ECB             : 
    1203 CBC       64113 :     if (merge)
    1204                 :     {
    1205 ECB             :         /*
    1206                 :          * Another BTSpool for dead tuples exists. Now we have to merge
    1207                 :          * btspool and btspool2.
    1208                 :          */
    1209                 : 
     1210                 :         /* prepare for the merge */
    1211 GIC          13 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1212              13 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1213 ECB             : 
    1214                 :         /* Prepare SortSupport data for each column */
    1215 GIC          13 :         sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
    1216                 : 
    1217 CBC          27 :         for (i = 0; i < keysz; i++)
    1218                 :         {
    1219              14 :             SortSupport sortKey = sortKeys + i;
    1220 GIC          14 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1221 ECB             :             int16       strategy;
    1222                 : 
    1223 GIC          14 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1224              14 :             sortKey->ssup_collation = scanKey->sk_collation;
    1225 CBC          14 :             sortKey->ssup_nulls_first =
    1226              14 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1227              14 :             sortKey->ssup_attno = scanKey->sk_attno;
    1228 ECB             :             /* Abbreviation is not supported here */
    1229 CBC          14 :             sortKey->abbreviate = false;
    1230                 : 
    1231 GNC          14 :             Assert(sortKey->ssup_attno != 0);
    1232                 : 
    1233 CBC          14 :             strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
    1234                 :                 BTGreaterStrategyNumber : BTLessStrategyNumber;
    1235 ECB             : 
    1236 GIC          14 :             PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
    1237                 :         }
    1238 ECB             : 
    1239                 :         for (;;)
    1240                 :         {
     1241 GIC        1626 :             load1 = true;       /* load from btspool next? */
    1242            1626 :             if (itup2 == NULL)
    1243 ECB             :             {
    1244 CBC          75 :                 if (itup == NULL)
    1245 GIC          13 :                     break;
    1246 ECB             :             }
    1247 CBC        1551 :             else if (itup != NULL)
    1248                 :             {
    1249            1460 :                 int32       compare = 0;
    1250                 : 
    1251            1583 :                 for (i = 1; i <= keysz; i++)
    1252                 :                 {
    1253 ECB             :                     SortSupport entry;
    1254                 :                     Datum       attrDatum1,
    1255                 :                                 attrDatum2;
    1256                 :                     bool        isNull1,
    1257                 :                                 isNull2;
    1258                 : 
    1259 GIC        1488 :                     entry = sortKeys + i - 1;
    1260            1488 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1261 CBC        1488 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1262 ECB             : 
    1263 CBC        1488 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1264                 :                                                   attrDatum2, isNull2,
    1265 ECB             :                                                   entry);
    1266 GIC        1488 :                     if (compare > 0)
    1267                 :                     {
    1268 CBC         114 :                         load1 = false;
    1269 GIC        1365 :                         break;
    1270 ECB             :                     }
    1271 CBC        1374 :                     else if (compare < 0)
    1272 GIC        1251 :                         break;
    1273 ECB             :                 }
    1274                 : 
    1275                 :                 /*
    1276                 :                  * If key values are equal, we sort on ItemPointer.  This is
    1277                 :                  * required for btree indexes, since heap TID is treated as an
    1278                 :                  * implicit last key attribute in order to ensure that all
    1279                 :                  * keys in the index are physically unique.
    1280                 :                  */
    1281 GIC        1460 :                 if (compare == 0)
    1282                 :                 {
    1283 CBC          95 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1284 GIC          95 :                     Assert(compare != 0);
    1285 CBC          95 :                     if (compare > 0)
    1286              20 :                         load1 = false;
    1287 ECB             :                 }
    1288                 :             }
    1289                 :             else
    1290 GIC          91 :                 load1 = false;
    1291                 : 
    1292 ECB             :             /* When we see first tuple, create first index page */
    1293 GIC        1613 :             if (state == NULL)
    1294              13 :                 state = _bt_pagestate(wstate, 0);
    1295 ECB             : 
    1296 CBC        1613 :             if (load1)
    1297                 :             {
    1298            1388 :                 _bt_buildadd(wstate, state, itup, 0);
    1299 GIC        1388 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1300 ECB             :             }
    1301                 :             else
    1302                 :             {
    1303 GIC         225 :                 _bt_buildadd(wstate, state, itup2, 0);
    1304             225 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1305 ECB             :             }
    1306                 : 
    1307                 :             /* Report progress */
    1308 GIC        1613 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1309                 :                                          ++tuples_done);
    1310 ECB             :         }
    1311 GIC          13 :         pfree(sortKeys);
    1312                 :     }
    1313 CBC       64100 :     else if (deduplicate)
    1314                 :     {
    1315 ECB             :         /* merge is unnecessary, deduplicate into posting lists */
    1316                 :         BTDedupState dstate;
    1317                 : 
    1318 GIC        7437 :         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
    1319            7437 :         dstate->deduplicate = true; /* unused */
    1320 CBC        7437 :         dstate->nmaxitems = 0;   /* unused */
    1321            7437 :         dstate->maxpostingsize = 0; /* set later */
    1322 ECB             :         /* Metadata about base tuple of current pending posting list */
    1323 CBC        7437 :         dstate->base = NULL;
    1324 GIC        7437 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1325 CBC        7437 :         dstate->basetupsize = 0;
    1326 ECB             :         /* Metadata about current pending posting list TIDs */
    1327 CBC        7437 :         dstate->htids = NULL;
    1328 GIC        7437 :         dstate->nhtids = 0;
    1329 CBC        7437 :         dstate->nitems = 0;
    1330            7437 :         dstate->phystupsize = 0; /* unused */
    1331            7437 :         dstate->nintervals = 0; /* unused */
    1332 ECB             : 
    1333 CBC     3072212 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1334 GIC     3072212 :                                                true)) != NULL)
    1335 ECB             :         {
    1336                 :             /* When we see first tuple, create first index page */
    1337 GIC     3064775 :             if (state == NULL)
    1338                 :             {
    1339 CBC        1563 :                 state = _bt_pagestate(wstate, 0);
    1340                 : 
    1341 ECB             :                 /*
    1342                 :                  * Limit size of posting list tuples to 1/10 space we want to
    1343                 :                  * leave behind on the page, plus space for final item's line
    1344                 :                  * pointer.  This is equal to the space that we'd like to
    1345                 :                  * leave behind on each leaf page when fillfactor is 90,
    1346                 :                  * allowing us to get close to fillfactor% space utilization
    1347                 :                  * when there happen to be a great many duplicates.  (This
    1348                 :                  * makes higher leaf fillfactor settings ineffective when
    1349                 :                  * building indexes that have many duplicates, but packing
    1350                 :                  * leaf pages full with few very large tuples doesn't seem
    1351                 :                  * like a useful goal.)
    1352                 :                  */
    1353 GIC        1563 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1354                 :                     sizeof(ItemIdData);
    1355 CBC        1563 :                 Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
    1356                 :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1357            1563 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1358                 : 
    1359 ECB             :                 /* start new pending posting list with itup copy */
    1360 GIC        1563 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1361                 :                                         InvalidOffsetNumber);
    1362 ECB             :             }
    1363 GIC     3063212 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1364          666407 :                                          itup) > keysz &&
    1365 CBC      666407 :                      _bt_dedup_save_htid(dstate, itup))
    1366 ECB             :             {
    1367                 :                 /*
    1368                 :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1369                 :                  * TID from itup has been saved in state.
    1370                 :                  */
    1371                 :             }
    1372                 :             else
    1373                 :             {
    1374                 :                 /*
    1375                 :                  * Tuple is not equal to pending posting list tuple, or
    1376                 :                  * _bt_dedup_save_htid() opted to not merge current item into
    1377                 :                  * pending posting list.
    1378                 :                  */
    1379 GIC     2400548 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1380         2400548 :                 pfree(dstate->base);
    1381 ECB             : 
    1382                 :                 /* start new pending posting list with itup copy */
    1383 GIC     2400548 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1384                 :                                         InvalidOffsetNumber);
    1385 ECB             :             }
    1386                 : 
    1387                 :             /* Report progress */
    1388 GIC     3064775 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1389                 :                                          ++tuples_done);
    1390 ECB             :         }
    1391                 : 
    1392 GIC        7437 :         if (state)
    1393                 :         {
    1394 ECB             :             /*
    1395                 :              * Handle the last item (there must be a last item when the
    1396                 :              * tuplesort returned one or more tuples)
    1397                 :              */
    1398 GIC        1563 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1399            1563 :             pfree(dstate->base);
    1400 CBC        1563 :             pfree(dstate->htids);
    1401 ECB             :         }
    1402                 : 
    1403 GIC        7437 :         pfree(dstate);
    1404                 :     }
    1405 ECB             :     else
    1406                 :     {
    1407                 :         /* merging and deduplication are both unnecessary */
    1408 GIC     8759792 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1409         8759792 :                                                true)) != NULL)
    1410 ECB             :         {
    1411                 :             /* When we see first tuple, create first index page */
    1412 GIC     8703129 :             if (state == NULL)
    1413           16925 :                 state = _bt_pagestate(wstate, 0);
    1414 ECB             : 
    1415 CBC     8703129 :             _bt_buildadd(wstate, state, itup, 0);
    1416                 : 
    1417 ECB             :             /* Report progress */
    1418 GIC     8703129 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1419                 :                                          ++tuples_done);
    1420 ECB             :         }
    1421                 :     }
    1422                 : 
    1423                 :     /* Close down final pages and write the metapage */
    1424 GIC       64113 :     _bt_uppershutdown(wstate, state);
    1425                 : 
    1426 ECB             :     /*
     1427                 :      * When we WAL-log index pages, we must nonetheless fsync index files.
    1428                 :      * Since we're building outside shared buffers, a CHECKPOINT occurring
    1429                 :      * during the build has no way to flush the previously written data to
    1430                 :      * disk (indeed it won't know the index even exists).  A crash later on
    1431                 :      * would replay WAL from the checkpoint, therefore it wouldn't replay our
    1432                 :      * earlier WAL entries. If we do not fsync those pages here, they might
    1433                 :      * still not be on disk when the crash occurs.
    1434                 :      */
    1435 GIC       64113 :     if (wstate->btws_use_wal)
    1436           59235 :         smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM);
    1437 CBC       64113 : }
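                          : 
                          : /*
                          :  * A minimal standalone sketch of the two-spool merge loop in _bt_load()
                          :  * above, with plain ints standing in for index tuples and "<=" standing in
                          :  * for the column-by-column comparison plus the heap TID tie-break (which
                          :  * can never report equality).  toy_merge is an invented name.
                          :  */
                          : #include <stdbool.h>
                          : 
                          : static void
                          : toy_merge(const int *a, int na, const int *b, int nb, int *out)
                          : {
                          :     int         i = 0,
                          :                 j = 0,
                          :                 k = 0;
                          : 
                          :     while (i < na || j < nb)
                          :     {
                          :         bool        load1;
                          : 
                          :         if (j >= nb)
                          :             load1 = true;           /* spool 2 exhausted */
                          :         else if (i >= na)
                          :             load1 = false;          /* spool 1 exhausted */
                          :         else
                          :             load1 = (a[i] <= b[j]); /* lowest tuple wins */
                          : 
                          :         out[k++] = load1 ? a[i++] : b[j++];
                          :     }
                          : }
                          : 
                          : /*
                          :  * Related arithmetic from the deduplication path above, assuming 8kB pages
                          :  * and 8-byte MAXALIGN: maxpostingsize works out to
                          :  * MAXALIGN_DOWN(8192 * 10 / 100) - sizeof(ItemIdData) = 816 - 4 = 812
                          :  * bytes per posting list tuple.
                          :  */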
    1438 ECB             : 
    1439                 : /*
    1440                 :  * Create parallel context, and launch workers for leader.
    1441                 :  *
    1442                 :  * buildstate argument should be initialized (with the exception of the
    1443                 :  * tuplesort state in spools, which may later be created based on shared
    1444                 :  * state initially set up here).
    1445                 :  *
    1446                 :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1447                 :  *
    1448                 :  * request is the target number of parallel worker processes to launch.
    1449                 :  *
    1450                 :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1451                 :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1452                 :  * build.  If not even a single worker process can be launched, this is
    1453                 :  * never set, and caller should proceed with a serial index build.
    1454                 :  */
    1455                 : static void
    1456 GIC          71 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1457                 : {
    1458 ECB             :     ParallelContext *pcxt;
    1459                 :     int         scantuplesortstates;
    1460                 :     Snapshot    snapshot;
    1461                 :     Size        estbtshared;
    1462                 :     Size        estsort;
    1463                 :     BTShared   *btshared;
    1464                 :     Sharedsort *sharedsort;
    1465                 :     Sharedsort *sharedsort2;
    1466 GIC          71 :     BTSpool    *btspool = buildstate->spool;
    1467              71 :     BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
    1468 ECB             :     WalUsage   *walusage;
    1469                 :     BufferUsage *bufferusage;
    1470 GIC          71 :     bool        leaderparticipates = true;
    1471                 :     int         querylen;
    1472 ECB             : 
    1473                 : #ifdef DISABLE_LEADER_PARTICIPATION
    1474                 :     leaderparticipates = false;
    1475                 : #endif
    1476                 : 
    1477                 :     /*
    1478                 :      * Enter parallel mode, and create context for parallel build of btree
    1479                 :      * index
    1480                 :      */
    1481 GIC          71 :     EnterParallelMode();
    1482              71 :     Assert(request > 0);
    1483 CBC          71 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1484 ECB             :                                  request);
    1485                 : 
    1486 GIC          71 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
    1487                 : 
    1488 ECB             :     /*
    1489                 :      * Prepare for scan of the base relation.  In a normal index build, we use
    1490                 :      * SnapshotAny because we must retrieve all tuples and do our own time
    1491                 :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1492                 :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1493                 :      * live according to that.
    1494                 :      */
    1495 GIC          71 :     if (!isconcurrent)
    1496              71 :         snapshot = SnapshotAny;
    1497 ECB             :     else
    1498 LBC           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1499                 : 
    1500 EUB             :     /*
    1501                 :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1502                 :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1503                 :      */
    1504 GIC          71 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1505              71 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1506 CBC          71 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1507              71 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1508 ECB             : 
    1509                 :     /*
    1510                 :      * Unique case requires a second spool, and so we may have to account for
    1511                 :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1512                 :      */
    1513 GIC          71 :     if (!btspool->isunique)
    1514              39 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1515 ECB             :     else
    1516                 :     {
    1517 GIC          32 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1518              32 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1519 ECB             :     }
    1520                 : 
    1521                 :     /*
    1522                 :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1523                 :      * and PARALLEL_KEY_BUFFER_USAGE.
    1524                 :      *
    1525                 :      * If there are no extensions loaded that care, we could skip this.  We
    1526                 :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1527                 :      * pgBufferUsage, so do it unconditionally.
    1528                 :      */
    1529 GIC          71 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1530                 :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1531 CBC          71 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1532 GIC          71 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1533 ECB             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1534 CBC          71 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1535                 : 
    1536 ECB             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1537 GIC          71 :     if (debug_query_string)
    1538                 :     {
    1539 CBC          71 :         querylen = strlen(debug_query_string);
    1540 GIC          71 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1541 CBC          71 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1542 ECB             :     }
    1543                 :     else
    1544 UIC           0 :         querylen = 0;           /* keep compiler quiet */
    1545                 : 
    1546 EUB             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1547 GIC          71 :     InitializeParallelDSM(pcxt);
    1548                 : 
    1549 ECB             :     /* If no DSM segment was available, back out (do serial build) */
    1550 GIC          71 :     if (pcxt->seg == NULL)
    1551                 :     {
    1552 LBC           0 :         if (IsMVCCSnapshot(snapshot))
    1553 UIC           0 :             UnregisterSnapshot(snapshot);
    1554 UBC           0 :         DestroyParallelContext(pcxt);
    1555               0 :         ExitParallelMode();
    1556               0 :         return;
    1557 EUB             :     }
    1558                 : 
    1559                 :     /* Store shared build state, for which we reserved space */
    1560 GIC          71 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1561                 :     /* Initialize immutable state */
    1562 CBC          71 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1563 GIC          71 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1564 CBC          71 :     btshared->isunique = btspool->isunique;
    1565              71 :     btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1566              71 :     btshared->isconcurrent = isconcurrent;
    1567              71 :     btshared->scantuplesortstates = scantuplesortstates;
    1568              71 :     ConditionVariableInit(&btshared->workersdonecv);
    1569              71 :     SpinLockInit(&btshared->mutex);
    1570 ECB             :     /* Initialize mutable state */
    1571 CBC          71 :     btshared->nparticipantsdone = 0;
    1572 GIC          71 :     btshared->reltuples = 0.0;
    1573 CBC          71 :     btshared->havedead = false;
    1574              71 :     btshared->indtuples = 0.0;
    1575              71 :     btshared->brokenhotchain = false;
    1576              71 :     table_parallelscan_initialize(btspool->heap,
    1577 ECB             :                                   ParallelTableScanFromBTShared(btshared),
    1578                 :                                   snapshot);
    1579                 : 
    1580                 :     /*
    1581                 :      * Store shared tuplesort-private state, for which we reserved space.
    1582                 :      * Then, initialize opaque state using tuplesort routine.
    1583                 :      */
    1584 GIC          71 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1585              71 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1586 ECB             :                                 pcxt->seg);
    1587                 : 
    1588 GIC          71 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1589              71 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1590 ECB             : 
    1591                 :     /* Unique case requires a second spool, and associated shared state */
    1592 GIC          71 :     if (!btspool->isunique)
    1593              39 :         sharedsort2 = NULL;
    1594 ECB             :     else
    1595                 :     {
    1596                 :         /*
    1597                 :          * Store additional shared tuplesort-private state, for which we
    1598                 :          * reserved space.  Then, initialize opaque state using tuplesort
    1599                 :          * routine.
    1600                 :          */
    1601 GIC          32 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1602              32 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1603 ECB             :                                     pcxt->seg);
    1604                 : 
    1605 GIC          32 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1606                 :     }
    1607 ECB             : 
    1608                 :     /* Store query string for workers */
    1609 GIC          71 :     if (debug_query_string)
    1610                 :     {
    1611 ECB             :         char       *sharedquery;
    1612                 : 
    1613 GIC          71 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1614              71 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    1615 CBC          71 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1616 ECB             :     }
    1617                 : 
    1618                 :     /*
    1619                 :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1620                 :      * initialize.
    1621                 :      */
    1622 GIC          71 :     walusage = shm_toc_allocate(pcxt->toc,
    1623              71 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1624 CBC          71 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1625              71 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1626              71 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1627              71 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1628 ECB             : 
    1629                 :     /* Launch workers, saving status for leader/caller */
    1630 GIC          71 :     LaunchParallelWorkers(pcxt);
    1631              71 :     btleader->pcxt = pcxt;
    1632 CBC          71 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1633              71 :     if (leaderparticipates)
    1634              71 :         btleader->nparticipanttuplesorts++;
    1635              71 :     btleader->btshared = btshared;
    1636              71 :     btleader->sharedsort = sharedsort;
    1637              71 :     btleader->sharedsort2 = sharedsort2;
    1638              71 :     btleader->snapshot = snapshot;
    1639              71 :     btleader->walusage = walusage;
    1640              71 :     btleader->bufferusage = bufferusage;
    1641 ECB             : 
    1642                 :     /* If no workers were successfully launched, back out (do serial build) */
    1643 GIC          71 :     if (pcxt->nworkers_launched == 0)
    1644                 :     {
    1645 LBC           0 :         _bt_end_parallel(btleader);
    1646 UIC           0 :         return;
    1647 EUB             :     }
    1648                 : 
    1649                 :     /* Save leader state now that it's clear build will be parallel */
    1650 GIC          71 :     buildstate->btleader = btleader;
    1651                 : 
    1652 ECB             :     /* Join heap scan ourselves */
    1653 GIC          71 :     if (leaderparticipates)
    1654              71 :         _bt_leader_participate_as_worker(buildstate);
    1655 ECB             : 
    1656                 :     /*
    1657                 :      * Caller needs to wait for all launched workers when we return.  Make
    1658                 :      * sure that the failure-to-start case will not hang forever.
    1659                 :      */
    1660 GIC          71 :     WaitForParallelWorkersToAttach(pcxt);
    1661                 : }
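                          : 
                          : /*
                          :  * Bookkeeping note on the estimator calls in _bt_begin_parallel() above:
                          :  * every later shm_toc_allocate() must be covered by an earlier
                          :  * shm_toc_estimate_chunk(), and every shm_toc_insert() by a counted
                          :  * shm_toc_estimate_keys().  Tallying the keys here: 2 for
                          :  * PARALLEL_KEY_BTREE_SHARED plus PARALLEL_KEY_TUPLESORT (3 when the
                          :  * unique case adds PARALLEL_KEY_TUPLESORT_SPOOL2), 1 each for
                          :  * PARALLEL_KEY_WAL_USAGE and PARALLEL_KEY_BUFFER_USAGE, and normally 1
                          :  * for PARALLEL_KEY_QUERY_TEXT.
                          :  */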
    1662 ECB             : 
    1663                 : /*
    1664                 :  * Shut down workers, destroy parallel context, and end parallel mode.
    1665                 :  */
    1666                 : static void
    1667 GIC          71 : _bt_end_parallel(BTLeader *btleader)
    1668                 : {
    1669 ECB             :     int         i;
    1670                 : 
    1671                 :     /* Shutdown worker processes */
    1672 GIC          71 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1673                 : 
    1674 ECB             :     /*
     1675                 :      * Next, accumulate WAL and buffer usage.  (This must wait for the
     1676                 :      * workers to finish, or we might get incomplete data.)
    1677                 :      */
    1678 GIC         142 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1679              71 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1680 ECB             : 
    1681                 :     /* Free last reference to MVCC snapshot, if one was used */
    1682 GIC          71 :     if (IsMVCCSnapshot(btleader->snapshot))
    1683 UIC           0 :         UnregisterSnapshot(btleader->snapshot);
    1684 CBC          71 :     DestroyParallelContext(btleader->pcxt);
    1685 GBC          71 :     ExitParallelMode();
    1686 CBC          71 : }
    1687 ECB             : 
    1688                 : /*
    1689                 :  * Returns size of shared memory required to store state for a parallel
    1690                 :  * btree index build based on the snapshot its parallel scan will use.
    1691                 :  */
    1692                 : static Size
    1693 GIC          71 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1694                 : {
    1695 ECB             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1696 GIC          71 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1697                 :                     table_parallelscan_estimate(heap, snapshot));
    1698 ECB             : }
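                          : 
                          : /*
                          :  * Worked example for the estimate above -- a sketch; actual sizes vary by
                          :  * build: with a hypothetical sizeof(BTShared) of 200 bytes and the typical
                          :  * 32-byte buffer alignment, BUFFERALIGN(200) = 224, to which
                          :  * table_parallelscan_estimate() adds the table AM's parallel scan
                          :  * descriptor.  add_size() is overflow-checked addition, used for all
                          :  * shared memory size calculations.
                          :  */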
    1699                 : 
    1700                 : /*
    1701                 :  * Within leader, wait for end of heap scan.
    1702                 :  *
    1703                 :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1704                 :  * already be underway within worker processes (when leader participates
    1705                 :  * as a worker, we should end up here just as workers are finishing).
    1706                 :  *
    1707                 :  * Fills in fields needed for ambuild statistics, and lets caller set
    1708                 :  * field indicating that some worker encountered a broken HOT chain.
    1709                 :  *
    1710                 :  * Returns the total number of heap tuples scanned.
    1711                 :  */
    1712                 : static double
    1713 GIC          71 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1714                 : {
    1715 CBC          71 :     BTShared   *btshared = buildstate->btleader->btshared;
    1716                 :     int         nparticipanttuplesorts;
    1717 ECB             :     double      reltuples;
    1718                 : 
    1719 GIC          71 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1720                 :     for (;;)
    1721 ECB             :     {
    1722 GIC         185 :         SpinLockAcquire(&btshared->mutex);
    1723             185 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1724 ECB             :         {
    1725 CBC          71 :             buildstate->havedead = btshared->havedead;
    1726 GIC          71 :             buildstate->indtuples = btshared->indtuples;
    1727 CBC          71 :             *brokenhotchain = btshared->brokenhotchain;
    1728              71 :             reltuples = btshared->reltuples;
    1729              71 :             SpinLockRelease(&btshared->mutex);
    1730              71 :             break;
    1731 ECB             :         }
    1732 CBC         114 :         SpinLockRelease(&btshared->mutex);
    1733                 : 
    1734             114 :         ConditionVariableSleep(&btshared->workersdonecv,
    1735                 :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1736 ECB             :     }
    1737                 : 
    1738 GIC          71 :     ConditionVariableCancelSleep();
    1739                 : 
    1740 CBC          71 :     return reltuples;
    1741                 : }
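                          : 
                          : /*
                          :  * A minimal pthread analogue (illustrative, not PostgreSQL code) of the
                          :  * wait loop in _bt_parallel_heapscan() above: re-check the shared counter
                          :  * under the lock and sleep until every participant has reported in.
                          :  * ToyBarrier and toy_wait_all_done are invented names.
                          :  */
                          : #include <pthread.h>
                          : 
                          : typedef struct ToyBarrier
                          : {
                          :     pthread_mutex_t mutex;      /* stands in for btshared->mutex */
                          :     pthread_cond_t  donecv;     /* stands in for workersdonecv */
                          :     int             ndone;      /* participants finished so far */
                          :     int             ntotal;     /* participants expected */
                          : } ToyBarrier;
                          : 
                          : static void
                          : toy_wait_all_done(ToyBarrier *b)
                          : {
                          :     pthread_mutex_lock(&b->mutex);
                          :     while (b->ndone < b->ntotal)
                          :         pthread_cond_wait(&b->donecv, &b->mutex);   /* drops lock while waiting */
                          :     pthread_mutex_unlock(&b->mutex);
                          : }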
    1742 ECB             : 
    1743                 : /*
    1744                 :  * Within leader, participate as a parallel worker.
    1745                 :  */
    1746                 : static void
    1747 GIC          71 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1748                 : {
    1749 CBC          71 :     BTLeader   *btleader = buildstate->btleader;
    1750                 :     BTSpool    *leaderworker;
    1751 ECB             :     BTSpool    *leaderworker2;
    1752                 :     int         sortmem;
    1753                 : 
    1754                 :     /* Allocate memory and initialize private spool */
    1755 GIC          71 :     leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    1756              71 :     leaderworker->heap = buildstate->spool->heap;
    1757 CBC          71 :     leaderworker->index = buildstate->spool->index;
    1758              71 :     leaderworker->isunique = buildstate->spool->isunique;
    1759              71 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1760 ECB             : 
    1761                 :     /* Initialize second spool, if required */
    1762 GIC          71 :     if (!btleader->btshared->isunique)
    1763              39 :         leaderworker2 = NULL;
    1764 ECB             :     else
    1765                 :     {
    1766                 :         /* Allocate memory for worker's own private secondary spool */
    1767 GIC          32 :         leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1768                 : 
    1769 ECB             :         /* Initialize worker's own secondary spool */
    1770 GIC          32 :         leaderworker2->heap = leaderworker->heap;
    1771              32 :         leaderworker2->index = leaderworker->index;
    1772 CBC          32 :         leaderworker2->isunique = false;
    1773 ECB             :     }
    1774                 : 
    1775                 :     /*
     1776                 :      * Might as well use a reliable figure when doling out maintenance_work_mem
     1777                 :      * (when the requested number of workers was not launched, this will be
     1778                 :      * somewhat higher than it is for other workers).
    1779                 :      */
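                          :     /*
                          :      * Illustrative arithmetic (hypothetical numbers, not taken from this
                          :      * build): with maintenance_work_mem = 65536 kB and 4 participant
                          :      * tuplesorts (3 launched workers plus the leader), each participant
                          :      * sorts with 65536 / 4 = 16384 kB.
                          :      */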
    1780 GIC          71 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1781                 : 
    1782 ECB             :     /* Perform work common to all participants */
    1783 GIC          71 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1784                 :                                btleader->sharedsort, btleader->sharedsort2,
    1785 ECB             :                                sortmem, true);
    1786                 : 
    1787                 : #ifdef BTREE_BUILD_STATS
    1788                 :     if (log_btree_build_stats)
    1789                 :     {
    1790                 :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1791                 :         ResetUsage();
    1792                 :     }
    1793                 : #endif                          /* BTREE_BUILD_STATS */
    1794 GIC          71 : }
    1795                 : 
    1796 ECB             : /*
    1797                 :  * Perform work within a launched parallel process.
    1798                 :  */
    1799                 : void
    1800 GIC          71 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1801                 : {
    1802 ECB             :     char       *sharedquery;
    1803                 :     BTSpool    *btspool;
    1804                 :     BTSpool    *btspool2;
    1805                 :     BTShared   *btshared;
    1806                 :     Sharedsort *sharedsort;
    1807                 :     Sharedsort *sharedsort2;
    1808                 :     Relation    heapRel;
    1809                 :     Relation    indexRel;
    1810                 :     LOCKMODE    heapLockmode;
    1811                 :     LOCKMODE    indexLockmode;
    1812                 :     WalUsage   *walusage;
    1813                 :     BufferUsage *bufferusage;
    1814                 :     int         sortmem;
    1815                 : 
    1816                 : #ifdef BTREE_BUILD_STATS
    1817                 :     if (log_btree_build_stats)
    1818                 :         ResetUsage();
    1819                 : #endif                          /* BTREE_BUILD_STATS */
    1820                 : 
    1821                 :     /*
     1822                 :      * The only possible status flag that can be set for the parallel worker is
    1823                 :      * PROC_IN_SAFE_IC.
    1824                 :      */
    1825 GIC          71 :     Assert((MyProc->statusFlags == 0) ||
    1826                 :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1827 ECB             : 
    1828                 :     /* Set debug_query_string for individual workers first */
    1829 GIC          71 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1830              71 :     debug_query_string = sharedquery;
    1831 ECB             : 
    1832                 :     /* Report the query string from leader */
    1833 GIC          71 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1834                 : 
    1835 ECB             :     /* Look up nbtree shared state */
    1836 GIC          71 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1837                 : 
    1838 ECB             :     /* Open relations using lock modes known to be obtained by index.c */
    1839 GIC          71 :     if (!btshared->isconcurrent)
    1840                 :     {
    1841 CBC          71 :         heapLockmode = ShareLock;
    1842 GIC          71 :         indexLockmode = AccessExclusiveLock;
    1843 ECB             :     }
    1844                 :     else
    1845                 :     {
    1846 UIC           0 :         heapLockmode = ShareUpdateExclusiveLock;
    1847               0 :         indexLockmode = RowExclusiveLock;
    1848 EUB             :     }
    1849                 : 
    1850                 :     /* Open relations within worker */
    1851 GIC          71 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1852              71 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1853 ECB             : 
    1854                 :     /* Initialize worker's own spool */
    1855 GIC          71 :     btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    1856              71 :     btspool->heap = heapRel;
    1857 CBC          71 :     btspool->index = indexRel;
    1858              71 :     btspool->isunique = btshared->isunique;
    1859              71 :     btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1860 ECB             : 
    1861                 :     /* Look up shared state private to tuplesort.c */
    1862 GIC          71 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1863              71 :     tuplesort_attach_shared(sharedsort, seg);
    1864 CBC          71 :     if (!btshared->isunique)
    1865 ECB             :     {
    1866 CBC          39 :         btspool2 = NULL;
    1867 GIC          39 :         sharedsort2 = NULL;
    1868 ECB             :     }
    1869                 :     else
    1870                 :     {
    1871                 :         /* Allocate memory for worker's own private secondary spool */
    1872 GIC          32 :         btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1873                 : 
    1874 ECB             :         /* Initialize worker's own secondary spool */
    1875 GIC          32 :         btspool2->heap = btspool->heap;
    1876              32 :         btspool2->index = btspool->index;
    1877 CBC          32 :         btspool2->isunique = false;
    1878 ECB             :         /* Look up shared state private to tuplesort.c */
    1879 CBC          32 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1880 GIC          32 :         tuplesort_attach_shared(sharedsort2, seg);
    1881 ECB             :     }
    1882                 : 
    1883                 :     /* Prepare to track buffer usage during parallel execution */
    1884 GIC          71 :     InstrStartParallelQuery();
    1885                 : 
    1886 ECB             :     /* Perform sorting of spool, and possibly a spool2 */
    1887 GIC          71 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1888              71 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1889 ECB             :                                sharedsort2, sortmem, false);
    1890                 : 
    1891                 :     /* Report WAL/buffer usage during parallel execution */
    1892 GIC          71 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1893              71 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1894 CBC          71 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1895              71 :                           &walusage[ParallelWorkerNumber]);
    1896 ECB             : 
    1897                 : #ifdef BTREE_BUILD_STATS
    1898                 :     if (log_btree_build_stats)
    1899                 :     {
    1900                 :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1901                 :         ResetUsage();
    1902                 :     }
    1903                 : #endif                          /* BTREE_BUILD_STATS */
    1904                 : 
    1905 GIC          71 :     index_close(indexRel, indexLockmode);
    1906              71 :     table_close(heapRel, heapLockmode);
    1907 CBC          71 : }
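                          : 
                          : /*
                          :  * The shm_toc_lookup() calls above can only succeed because the leader
                          :  * stored entries under the same PARALLEL_KEY_* constants before any
                          :  * worker was launched.  A minimal sketch of that leader-side pairing
                          :  * follows, assuming a ParallelContext *pcxt and a chunk size computed
                          :  * earlier with shm_toc_estimate_chunk(); the function name is
                          :  * hypothetical.
                          :  */
                          : #include "access/parallel.h"
                          : #include "storage/shm_toc.h"
                          : 
                          : static void
                          : my_publish_btree_shared(ParallelContext *pcxt, Size estbtshared)
                          : {
                          :     BTShared   *btshared;
                          : 
                          :     /* Carve out space for the shared state in the DSM segment ... */
                          :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
                          :     /* ... initialize btshared fields here ... */
                          : 
                          :     /* ... and advertise it in the table of contents under a fixed key */
                          :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
                          : }
                          : 
                          : /*
                          :  * Workers then retrieve the entry with shm_toc_lookup(toc, key, false);
                          :  * passing noError = false turns a missing entry into an error, which is
                          :  * why the optional query text above is looked up with noError = true.
                          :  */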
    1908 ECB             : 
    1909                 : /*
    1910                 :  * Perform a worker's portion of a parallel sort.
    1911                 :  *
     1912                 :  * This generates a tuplesort for the passed btspool, and a second tuplesort
     1913                 :  * state if a second btspool is needed (i.e. for unique index builds).  All
    1914                 :  * other spool fields should already be set when this is called.
    1915                 :  *
    1916                 :  * sortmem is the amount of working memory to use within each worker,
     1917                 :  * expressed in kilobytes.
    1918                 :  *
    1919                 :  * When this returns, workers are done, and need only release resources.
    1920                 :  */
    1921                 : static void
    1922 GIC         142 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1923                 :                            BTShared *btshared, Sharedsort *sharedsort,
    1924 ECB             :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1925                 : {
    1926                 :     SortCoordinate coordinate;
    1927                 :     BTBuildState buildstate;
    1928                 :     TableScanDesc scan;
    1929                 :     double      reltuples;
    1930                 :     IndexInfo  *indexInfo;
    1931                 : 
    1932                 :     /* Initialize local tuplesort coordination state */
    1933 GIC         142 :     coordinate = palloc0(sizeof(SortCoordinateData));
    1934             142 :     coordinate->isWorker = true;
    1935 CBC         142 :     coordinate->nParticipants = -1;
    1936             142 :     coordinate->sharedsort = sharedsort;
    1937 ECB             : 
    1938                 :     /* Begin "partial" tuplesort */
    1939 GIC         284 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1940                 :                                                      btspool->index,
    1941 CBC         142 :                                                      btspool->isunique,
    1942 GIC         142 :                                                      btspool->nulls_not_distinct,
    1943 ECB             :                                                      sortmem, coordinate,
    1944                 :                                                      TUPLESORT_NONE);
    1945                 : 
    1946                 :     /*
     1947                 :      * Just as with the serial case, there may be a second spool.  If so, a
    1948                 :      * second, dedicated spool2 partial tuplesort is required.
    1949                 :      */
    1950 GIC         142 :     if (btspool2)
    1951                 :     {
    1952 ECB             :         SortCoordinate coordinate2;
    1953                 : 
    1954                 :         /*
    1955                 :          * We expect that the second one (for dead tuples) won't get very
     1956                 :          * full, so we give it only work_mem (unless sortmem is less for
     1957                 :          * the worker).  Worker processes are generally permitted to allocate
    1958                 :          * work_mem independently.
    1959                 :          */
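                          :         /*
                          :          * Illustrative numbers (hypothetical): with work_mem = 4096 kB
                          :          * and sortmem = 16384 kB, the spool2 tuplesort begun below is
                          :          * capped at Min(16384, 4096) = 4096 kB.
                          :          */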
    1960 GIC          64 :         coordinate2 = palloc0(sizeof(SortCoordinateData));
    1961              64 :         coordinate2->isWorker = true;
    1962 CBC          64 :         coordinate2->nParticipants = -1;
    1963              64 :         coordinate2->sharedsort = sharedsort2;
    1964              64 :         btspool2->sortstate =
    1965              64 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1966 ECB             :                                         Min(sortmem, work_mem), coordinate2,
     1967                 :                                         TUPLESORT_NONE);
    1968                 :     }
    1969                 : 
    1970                 :     /* Fill in buildstate for _bt_build_callback() */
    1971 GIC         142 :     buildstate.isunique = btshared->isunique;
    1972             142 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1973 CBC         142 :     buildstate.havedead = false;
    1974             142 :     buildstate.heap = btspool->heap;
    1975             142 :     buildstate.spool = btspool;
    1976             142 :     buildstate.spool2 = btspool2;
    1977             142 :     buildstate.indtuples = 0;
    1978             142 :     buildstate.btleader = NULL;
    1979 ECB             : 
    1980                 :     /* Join parallel scan */
    1981 GIC         142 :     indexInfo = BuildIndexInfo(btspool->index);
    1982             142 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1983 CBC         142 :     scan = table_beginscan_parallel(btspool->heap,
    1984 ECB             :                                     ParallelTableScanFromBTShared(btshared));
    1985 CBC         142 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1986                 :                                        true, progress, _bt_build_callback,
    1987 ECB             :                                        (void *) &buildstate, scan);
    1988                 : 
    1989                 :     /* Execute this worker's part of the sort */
    1990 GIC         142 :     if (progress)
    1991              71 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1992 ECB             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1993 CBC         142 :     tuplesort_performsort(btspool->sortstate);
    1994 GIC         142 :     if (btspool2)
    1995 ECB             :     {
    1996 CBC          64 :         if (progress)
    1997 GIC          32 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1998 ECB             :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1999 CBC          64 :         tuplesort_performsort(btspool2->sortstate);
    2000                 :     }
    2001 ECB             : 
    2002                 :     /*
    2003                 :      * Done.  Record ambuild statistics, and whether we encountered a broken
    2004                 :      * HOT chain.
    2005                 :      */
    2006 GIC         142 :     SpinLockAcquire(&btshared->mutex);
    2007             142 :     btshared->nparticipantsdone++;
    2008 CBC         142 :     btshared->reltuples += reltuples;
    2009             142 :     if (buildstate.havedead)
    2010 LBC           0 :         btshared->havedead = true;
    2011 CBC         142 :     btshared->indtuples += buildstate.indtuples;
    2012 GBC         142 :     if (indexInfo->ii_BrokenHotChain)
    2013 LBC           0 :         btshared->brokenhotchain = true;
    2014 CBC         142 :     SpinLockRelease(&btshared->mutex);
    2015 EUB             : 
    2016 ECB             :     /* Notify leader */
    2017 GIC         142 :     ConditionVariableSignal(&btshared->workersdonecv);
    2018                 : 
    2019 ECB             :     /* We can end tuplesorts immediately */
    2020 GIC         142 :     tuplesort_end(btspool->sortstate);
    2021             142 :     if (btspool2)
    2022 CBC          64 :         tuplesort_end(btspool2->sortstate);
    2023             142 : }
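                          : 
                          : /*
                          :  * For contrast with the worker-side coordination state above (isWorker
                          :  * = true, nParticipants = -1), the leader begins its own tuplesort with
                          :  * a coordinate that names the participant count.  The sketch below is
                          :  * an illustrative reconstruction under that assumption, not the actual
                          :  * leader code; the function name is hypothetical.
                          :  */
                          : static void
                          : my_leader_begin_sort(BTSpool *btspool, BTLeader *btleader)
                          : {
                          :     SortCoordinate coordinate;
                          : 
                          :     coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
                          :     coordinate->isWorker = false;   /* leader merges the workers' runs */
                          :     coordinate->nParticipants = btleader->nparticipanttuplesorts;
                          :     coordinate->sharedsort = btleader->sharedsort;
                          : 
                          :     btspool->sortstate =
                          :         tuplesort_begin_index_btree(btspool->heap, btspool->index,
                          :                                     btspool->isunique,
                          :                                     btspool->nulls_not_distinct,
                          :                                     maintenance_work_mem, coordinate,
                          :                                     TUPLESORT_NONE);
                          : }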
        

Generated by: LCOV version v1.16-55-g56c0a2a