Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gistbuild.c
4 : * build algorithm for GiST indexes implementation.
5 : *
6 : * There are two different strategies:
7 : *
8 : * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
9 : * order, and create downlinks and internal pages as we go. This builds
10 : * the index from the bottom up, similar to how B-tree index build
11 : * works.
12 : *
13 : * 2. Start with an empty index, and insert all tuples one by one.
14 : *
15 : * The sorted method is used if the operator classes for all columns have
16 : * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
17 : *
18 : * The second strategy can optionally use buffers at different levels of
19 : * the tree to reduce I/O, see "Buffering build algorithm" in the README
20 : * for a more detailed explanation. It initially calls insert over and
21 : * over, but switches to the buffered algorithm after a certain number of
22 : * tuples (unless buffering mode is disabled).
23 : *
24 : *
25 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
26 : * Portions Copyright (c) 1994, Regents of the University of California
27 : *
28 : * IDENTIFICATION
29 : * src/backend/access/gist/gistbuild.c
30 : *
31 : *-------------------------------------------------------------------------
32 : */
33 : #include "postgres.h"
34 :
35 : #include <math.h>
36 :
37 : #include "access/genam.h"
38 : #include "access/gist_private.h"
39 : #include "access/gistxlog.h"
40 : #include "access/tableam.h"
41 : #include "access/xloginsert.h"
42 : #include "catalog/index.h"
43 : #include "miscadmin.h"
44 : #include "optimizer/optimizer.h"
45 : #include "storage/bufmgr.h"
46 : #include "storage/smgr.h"
47 : #include "utils/memutils.h"
48 : #include "utils/rel.h"
49 : #include "utils/tuplesort.h"
50 :
51 : /* Step of index tuples for check whether to switch to buffering build mode */
52 : #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
53 :
54 : /*
55 : * Number of tuples to process in the slow way before switching to buffering
56 : * mode, when buffering is explicitly turned on. Also, the number of tuples
57 : * to process between readjusting the buffer size parameter, while in
58 : * buffering mode.
59 : */
60 : #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
61 :
62 : /*
63 : * Strategy used to build the index. It can change between the
64 : * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
65 : * that needs to be decided up-front and cannot be changed afterwards.
66 : */
67 : typedef enum
68 : {
69 : GIST_SORTED_BUILD, /* bottom-up build by sorting */
70 : GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
71 : * switch */
72 : GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
73 : * buffering build mode if the index grows too
74 : * big */
75 : GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
76 : * before switching to the buffering build
77 : * mode */
78 : GIST_BUFFERING_ACTIVE /* in buffering build mode */
79 : } GistBuildMode;
80 :
81 : /* Working state for gistbuild and its callback */
82 : typedef struct
83 : {
84 : Relation indexrel;
85 : Relation heaprel;
86 : GISTSTATE *giststate;
87 :
88 : Size freespace; /* amount of free space to leave on pages */
89 :
90 : GistBuildMode buildMode;
91 :
92 : int64 indtuples; /* number of tuples indexed */
93 :
94 : /*
95 : * Extra data structures used during a buffering build. 'gfbb' contains
96 : * information related to managing the build buffers. 'parentMap' is a
97 : * lookup table of the parent of each internal page.
98 : */
99 : int64 indtuplesSize; /* total size of all indexed tuples */
100 : GISTBuildBuffers *gfbb;
101 : HTAB *parentMap;
102 :
103 : /*
104 : * Extra data structures used during a sorting build.
105 : */
106 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
107 :
108 : BlockNumber pages_allocated;
109 : BlockNumber pages_written;
110 :
111 : int ready_num_pages;
112 : BlockNumber ready_blknos[XLR_MAX_BLOCK_ID];
113 : Page ready_pages[XLR_MAX_BLOCK_ID];
114 : } GISTBuildState;
115 :
116 : #define GIST_SORTED_BUILD_PAGE_NUM 4
117 :
118 : /*
119 : * In sorted build, we use a stack of these structs, one for each level,
120 : * to hold an in-memory buffer of last pages at the level.
121 : *
122 : * Sorting GiST build requires good linearization of the sort opclass. This is
123 : * not always the case in multidimensional data. To tackle the anomalies, we
124 : * buffer index tuples and apply picksplit that can be multidimension-aware.
125 : */
126 : typedef struct GistSortedBuildLevelState
127 : {
128 : int current_page;
129 : BlockNumber last_blkno;
130 : struct GistSortedBuildLevelState *parent; /* Upper level, if any */
131 : Page pages[GIST_SORTED_BUILD_PAGE_NUM];
132 : } GistSortedBuildLevelState;
133 :
134 : /* prototypes for private functions */
135 :
136 : static void gistSortedBuildCallback(Relation index, ItemPointer tid,
137 : Datum *values, bool *isnull,
138 : bool tupleIsAlive, void *state);
139 : static void gist_indexsortbuild(GISTBuildState *state);
140 : static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
141 : GistSortedBuildLevelState *levelstate,
142 : IndexTuple itup);
143 : static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
144 : GistSortedBuildLevelState *levelstate);
145 : static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
146 :
147 : static void gistInitBuffering(GISTBuildState *buildstate);
148 : static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
149 : static void gistBuildCallback(Relation index,
150 : ItemPointer tid,
151 : Datum *values,
152 : bool *isnull,
153 : bool tupleIsAlive,
154 : void *state);
155 : static void gistBufferingBuildInsert(GISTBuildState *buildstate,
156 : IndexTuple itup);
157 : static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
158 : BlockNumber startblkno, int startlevel);
159 : static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
160 : Buffer buffer, int level,
161 : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
162 : BlockNumber parentblk, OffsetNumber downlinkoffnum);
163 : static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
164 : BlockNumber childblkno, int level,
165 : BlockNumber *parentblkno,
166 : OffsetNumber *downlinkoffnum);
167 : static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
168 : static void gistEmptyAllBuffers(GISTBuildState *buildstate);
169 : static int gistGetMaxLevel(Relation index);
170 :
171 : static void gistInitParentMap(GISTBuildState *buildstate);
172 : static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
173 : BlockNumber parent);
174 : static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
175 : Buffer parentbuf);
176 : static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
177 :
178 :
179 : /*
180 : * Main entry point to GiST index build.
181 : */
182 : IndexBuildResult *
2639 tgl 183 GIC 293 : gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
4231 heikki.linnakangas 184 ECB : {
185 : IndexBuildResult *result;
186 : double reltuples;
187 : GISTBuildState buildstate;
4231 heikki.linnakangas 188 GIC 293 : MemoryContext oldcxt = CurrentMemoryContext;
4231 heikki.linnakangas 189 ECB : int fillfactor;
190 : Oid SortSupportFnOids[INDEX_MAX_KEYS];
909 tgl 191 GIC 293 : GiSTOptions *options = (GiSTOptions *) index->rd_options;
934 heikki.linnakangas 192 ECB :
193 : /*
194 : * We expect to be called exactly once for any index relation. If that's
195 : * not the case, big trouble's what we have.
196 : */
934 heikki.linnakangas 197 GIC 293 : if (RelationGetNumberOfBlocks(index) != 0)
934 heikki.linnakangas 198 LBC 0 : elog(ERROR, "index \"%s\" already contains data",
934 heikki.linnakangas 199 EUB : RelationGetRelationName(index));
200 :
4231 heikki.linnakangas 201 GIC 293 : buildstate.indexrel = index;
1570 akorotkov 202 CBC 293 : buildstate.heaprel = heap;
934 heikki.linnakangas 203 293 : buildstate.sortstate = NULL;
204 293 : buildstate.giststate = initGISTstate(index);
1292 alvherre 205 ECB :
206 : /*
207 : * Create a temporary memory context that is reset once for each tuple
208 : * processed. (Note: we don't bother to make this a child of the
209 : * giststate's scanCxt, so we have to delete it separately at the end.)
210 : */
934 heikki.linnakangas 211 GIC 293 : buildstate.giststate->tempCxt = createTempGistContext();
934 heikki.linnakangas 212 ECB :
213 : /*
214 : * Choose build strategy. First check whether the user specified to use
215 : * buffering mode. (The use-case for that in the field is somewhat
216 : * questionable perhaps, but it's important for testing purposes.)
217 : */
909 tgl 218 GIC 293 : if (options)
934 heikki.linnakangas 219 ECB : {
1292 alvherre 220 GIC 16 : if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
934 heikki.linnakangas 221 CBC 6 : buildstate.buildMode = GIST_BUFFERING_STATS;
1292 alvherre 222 10 : else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
934 heikki.linnakangas 223 3 : buildstate.buildMode = GIST_BUFFERING_DISABLED;
909 tgl 224 ECB : else /* must be "auto" */
934 heikki.linnakangas 225 GIC 7 : buildstate.buildMode = GIST_BUFFERING_AUTO;
4231 heikki.linnakangas 226 ECB : }
227 : else
228 : {
934 heikki.linnakangas 229 GIC 277 : buildstate.buildMode = GIST_BUFFERING_AUTO;
4231 heikki.linnakangas 230 ECB : }
231 :
232 : /*
233 : * Unless buffering mode was forced, see if we can use sorting instead.
234 : */
909 tgl 235 GIC 293 : if (buildstate.buildMode != GIST_BUFFERING_STATS)
909 tgl 236 ECB : {
909 tgl 237 GIC 287 : bool hasallsortsupports = true;
909 tgl 238 CBC 287 : int keyscount = IndexRelationGetNumberOfKeyAttributes(index);
909 tgl 239 ECB :
909 tgl 240 GIC 359 : for (int i = 0; i < keyscount; i++)
909 tgl 241 ECB : {
909 tgl 242 GIC 290 : SortSupportFnOids[i] = index_getprocid(index, i + 1,
909 tgl 243 ECB : GIST_SORTSUPPORT_PROC);
909 tgl 244 GIC 290 : if (!OidIsValid(SortSupportFnOids[i]))
909 tgl 245 ECB : {
909 tgl 246 GIC 218 : hasallsortsupports = false;
909 tgl 247 CBC 218 : break;
909 tgl 248 ECB : }
249 : }
909 tgl 250 GIC 287 : if (hasallsortsupports)
909 tgl 251 CBC 69 : buildstate.buildMode = GIST_SORTED_BUILD;
909 tgl 252 ECB : }
253 :
254 : /*
255 : * Calculate target amount of free space to leave on pages.
256 : */
934 heikki.linnakangas 257 GIC 293 : fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
4231 heikki.linnakangas 258 CBC 293 : buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
4231 heikki.linnakangas 259 ECB :
260 : /*
261 : * Build the index using the chosen strategy.
262 : */
934 heikki.linnakangas 263 GIC 293 : buildstate.indtuples = 0;
934 heikki.linnakangas 264 CBC 293 : buildstate.indtuplesSize = 0;
4231 heikki.linnakangas 265 ECB :
934 heikki.linnakangas 266 GIC 293 : if (buildstate.buildMode == GIST_SORTED_BUILD)
934 heikki.linnakangas 267 ECB : {
268 : /*
269 : * Sort all data, build the index from bottom up.
270 : */
934 heikki.linnakangas 271 GIC 69 : buildstate.sortstate = tuplesort_begin_index_gist(heap,
934 heikki.linnakangas 272 ECB : index,
273 : maintenance_work_mem,
274 : NULL,
275 : TUPLESORT_NONE);
276 :
277 : /* Scan the table, adding all tuples to the tuplesort */
934 heikki.linnakangas 278 GIC 69 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
934 heikki.linnakangas 279 ECB : gistSortedBuildCallback,
280 : (void *) &buildstate, NULL);
281 :
282 : /*
283 : * Perform the sort and build index pages.
284 : */
934 heikki.linnakangas 285 GIC 69 : tuplesort_performsort(buildstate.sortstate);
934 heikki.linnakangas 286 ECB :
934 heikki.linnakangas 287 GIC 69 : gist_indexsortbuild(&buildstate);
934 heikki.linnakangas 288 ECB :
934 heikki.linnakangas 289 GIC 69 : tuplesort_end(buildstate.sortstate);
934 heikki.linnakangas 290 ECB : }
291 : else
292 : {
293 : /*
294 : * Initialize an empty index and insert all tuples, possibly using
295 : * buffers on intermediate levels.
296 : */
297 : Buffer buffer;
298 : Page page;
299 :
300 : /* initialize the root page */
8 andres 301 GNC 224 : buffer = gistNewBuffer(index, heap);
934 heikki.linnakangas 302 CBC 224 : Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
303 224 : page = BufferGetPage(buffer);
934 heikki.linnakangas 304 ECB :
934 heikki.linnakangas 305 GIC 224 : START_CRIT_SECTION();
934 heikki.linnakangas 306 ECB :
934 heikki.linnakangas 307 GIC 224 : GISTInitBuffer(buffer, F_LEAF);
934 heikki.linnakangas 308 ECB :
934 heikki.linnakangas 309 GIC 224 : MarkBufferDirty(buffer);
934 heikki.linnakangas 310 CBC 224 : PageSetLSN(page, GistBuildLSN);
934 heikki.linnakangas 311 ECB :
934 heikki.linnakangas 312 GIC 224 : UnlockReleaseBuffer(buffer);
934 heikki.linnakangas 313 ECB :
934 heikki.linnakangas 314 GIC 224 : END_CRIT_SECTION();
934 heikki.linnakangas 315 ECB :
316 : /* Scan the table, inserting all the tuples to the index. */
934 heikki.linnakangas 317 GIC 224 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
934 heikki.linnakangas 318 ECB : gistBuildCallback,
319 : (void *) &buildstate, NULL);
320 :
321 : /*
322 : * If buffering was used, flush out all the tuples that are still in
323 : * the buffers.
324 : */
934 heikki.linnakangas 325 GIC 224 : if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
934 heikki.linnakangas 326 ECB : {
934 heikki.linnakangas 327 GIC 3 : elog(DEBUG1, "all tuples processed, emptying buffers");
934 heikki.linnakangas 328 CBC 3 : gistEmptyAllBuffers(&buildstate);
329 3 : gistFreeBuildBuffers(buildstate.gfbb);
934 heikki.linnakangas 330 ECB : }
331 :
332 : /*
333 : * We didn't write WAL records as we built the index, so if
334 : * WAL-logging is required, write all pages to the WAL now.
335 : */
934 heikki.linnakangas 336 GIC 224 : if (RelationNeedsWAL(index))
934 heikki.linnakangas 337 ECB : {
934 heikki.linnakangas 338 GIC 132 : log_newpage_range(index, MAIN_FORKNUM,
934 heikki.linnakangas 339 ECB : 0, RelationGetNumberOfBlocks(index),
340 : true);
341 : }
342 : }
343 :
344 : /* okay, all heap tuples are indexed */
934 heikki.linnakangas 345 GIC 293 : MemoryContextSwitchTo(oldcxt);
934 heikki.linnakangas 346 CBC 293 : MemoryContextDelete(buildstate.giststate->tempCxt);
934 heikki.linnakangas 347 ECB :
934 heikki.linnakangas 348 GIC 293 : freeGISTstate(buildstate.giststate);
4209 tgl 349 ECB :
350 : /*
351 : * Return statistics
352 : */
934 heikki.linnakangas 353 GIC 293 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
4231 heikki.linnakangas 354 ECB :
934 heikki.linnakangas 355 GIC 293 : result->heap_tuples = reltuples;
934 heikki.linnakangas 356 CBC 293 : result->index_tuples = (double) buildstate.indtuples;
4231 heikki.linnakangas 357 ECB :
934 heikki.linnakangas 358 GIC 293 : return result;
934 heikki.linnakangas 359 ECB : }
360 :
361 : /*-------------------------------------------------------------------------
362 : * Routines for sorted build
363 : *-------------------------------------------------------------------------
364 : */
365 :
366 : /*
367 : * Per-tuple callback for table_index_build_scan.
368 : */
369 : static void
934 heikki.linnakangas 370 GIC 97072 : gistSortedBuildCallback(Relation index,
934 heikki.linnakangas 371 ECB : ItemPointer tid,
372 : Datum *values,
373 : bool *isnull,
374 : bool tupleIsAlive,
375 : void *state)
376 : {
934 heikki.linnakangas 377 GIC 97072 : GISTBuildState *buildstate = (GISTBuildState *) state;
934 heikki.linnakangas 378 ECB : MemoryContext oldCtx;
379 : Datum compressed_values[INDEX_MAX_KEYS];
380 :
934 heikki.linnakangas 381 GIC 97072 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
4231 heikki.linnakangas 382 ECB :
383 : /* Form an index tuple and point it at the heap tuple */
934 heikki.linnakangas 384 GIC 97072 : gistCompressValues(buildstate->giststate, index,
934 heikki.linnakangas 385 ECB : values, isnull,
386 : true, compressed_values);
387 :
934 heikki.linnakangas 388 GIC 97072 : tuplesort_putindextuplevalues(buildstate->sortstate,
934 heikki.linnakangas 389 ECB : buildstate->indexrel,
390 : tid,
391 : compressed_values, isnull);
392 :
934 heikki.linnakangas 393 GIC 97072 : MemoryContextSwitchTo(oldCtx);
934 heikki.linnakangas 394 CBC 97072 : MemoryContextReset(buildstate->giststate->tempCxt);
934 heikki.linnakangas 395 ECB :
396 : /* Update tuple count. */
934 heikki.linnakangas 397 GIC 97072 : buildstate->indtuples += 1;
934 heikki.linnakangas 398 CBC 97072 : }
934 heikki.linnakangas 399 ECB :
400 : /*
401 : * Build GiST index from bottom up from pre-sorted tuples.
402 : */
403 : static void
934 heikki.linnakangas 404 GIC 69 : gist_indexsortbuild(GISTBuildState *state)
934 heikki.linnakangas 405 ECB : {
406 : IndexTuple itup;
407 : GistSortedBuildLevelState *levelstate;
408 : Page page;
409 :
934 heikki.linnakangas 410 GIC 69 : state->pages_allocated = 0;
934 heikki.linnakangas 411 CBC 69 : state->pages_written = 0;
412 69 : state->ready_num_pages = 0;
4231 heikki.linnakangas 413 ECB :
414 : /*
415 : * Write an empty page as a placeholder for the root page. It will be
416 : * replaced with the real root page at the end.
417 : */
1 tmunro 418 GNC 69 : page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
636 tgl 419 CBC 69 : smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
934 heikki.linnakangas 420 ECB : page, true);
934 heikki.linnakangas 421 GIC 69 : state->pages_allocated++;
934 heikki.linnakangas 422 CBC 69 : state->pages_written++;
934 heikki.linnakangas 423 ECB :
424 : /* Allocate a temporary buffer for the first leaf page batch. */
426 akorotkov 425 GIC 69 : levelstate = palloc0(sizeof(GistSortedBuildLevelState));
426 akorotkov 426 CBC 69 : levelstate->pages[0] = page;
427 69 : levelstate->parent = NULL;
934 heikki.linnakangas 428 69 : gistinitpage(page, F_LEAF);
4231 heikki.linnakangas 429 ECB :
430 : /*
431 : * Fill index pages with tuples in the sorted order.
432 : */
934 heikki.linnakangas 433 GIC 97141 : while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
4231 heikki.linnakangas 434 ECB : {
426 akorotkov 435 GIC 97072 : gist_indexsortbuild_levelstate_add(state, levelstate, itup);
934 heikki.linnakangas 436 CBC 97072 : MemoryContextReset(state->giststate->tempCxt);
4231 heikki.linnakangas 437 ECB : }
438 :
439 : /*
440 : * Write out the partially full non-root pages.
441 : *
442 : * Keep in mind that flush can build a new root. If number of pages is > 1
443 : * then new root is required.
444 : */
426 akorotkov 445 GIC 82 : while (levelstate->parent != NULL || levelstate->current_page != 0)
1467 heikki.linnakangas 446 ECB : {
447 : GistSortedBuildLevelState *parent;
448 :
426 akorotkov 449 GIC 13 : gist_indexsortbuild_levelstate_flush(state, levelstate);
426 akorotkov 450 CBC 13 : parent = levelstate->parent;
451 65 : for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
452 52 : if (levelstate->pages[i])
453 52 : pfree(levelstate->pages[i]);
454 13 : pfree(levelstate);
455 13 : levelstate = parent;
1467 heikki.linnakangas 456 ECB : }
457 :
934 heikki.linnakangas 458 GIC 69 : gist_indexsortbuild_flush_ready_pages(state);
934 heikki.linnakangas 459 ECB :
460 : /* Write out the root */
426 akorotkov 461 GIC 69 : PageSetLSN(levelstate->pages[0], GistBuildLSN);
426 akorotkov 462 CBC 69 : PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
636 tgl 463 69 : smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
426 akorotkov 464 69 : levelstate->pages[0], true);
934 heikki.linnakangas 465 69 : if (RelationNeedsWAL(state->indexrel))
277 rhaas 466 GNC 53 : log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO,
426 akorotkov 467 ECB : levelstate->pages[0], true);
468 :
426 akorotkov 469 GIC 69 : pfree(levelstate->pages[0]);
426 akorotkov 470 CBC 69 : pfree(levelstate);
409 heikki.linnakangas 471 ECB :
472 : /*
473 : * When we WAL-logged index pages, we must nonetheless fsync index files.
474 : * Since we're building outside shared buffers, a CHECKPOINT occurring
475 : * during the build has no way to flush the previously written data to
476 : * disk (indeed it won't know the index even exists). A crash later on
477 : * would replay WAL from the checkpoint, therefore it wouldn't replay our
478 : * earlier WAL entries. If we do not fsync those pages here, they might
479 : * still not be on disk when the crash occurs.
480 : */
409 heikki.linnakangas 481 GIC 69 : if (RelationNeedsWAL(state->indexrel))
409 heikki.linnakangas 482 CBC 53 : smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM);
934 483 69 : }
934 heikki.linnakangas 484 ECB :
485 : /*
486 : * Add tuple to a page. If the pages are full, write them out and re-initialize
487 : * a new page first.
488 : */
489 : static void
426 akorotkov 490 GIC 97771 : gist_indexsortbuild_levelstate_add(GISTBuildState *state,
426 akorotkov 491 ECB : GistSortedBuildLevelState *levelstate,
492 : IndexTuple itup)
493 : {
494 : Size sizeNeeded;
495 :
496 : /* Check if tuple can be added to the current page */
426 akorotkov 497 GIC 97771 : sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
426 akorotkov 498 CBC 97771 : if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
426 akorotkov 499 ECB : {
500 : Page newPage;
426 akorotkov 501 GIC 521 : Page old_page = levelstate->pages[levelstate->current_page];
426 akorotkov 502 CBC 521 : uint16 old_page_flags = GistPageGetOpaque(old_page)->flags;
426 akorotkov 503 ECB :
426 akorotkov 504 GIC 521 : if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
426 akorotkov 505 ECB : {
426 akorotkov 506 GIC 127 : gist_indexsortbuild_levelstate_flush(state, levelstate);
426 akorotkov 507 ECB : }
508 : else
426 akorotkov 509 GIC 394 : levelstate->current_page++;
426 akorotkov 510 ECB :
426 akorotkov 511 GIC 521 : if (levelstate->pages[levelstate->current_page] == NULL)
1 tmunro 512 GNC 39 : levelstate->pages[levelstate->current_page] =
513 39 : palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
426 akorotkov 514 ECB :
426 akorotkov 515 CBC 521 : newPage = levelstate->pages[levelstate->current_page];
426 akorotkov 516 GIC 521 : gistinitpage(newPage, old_page_flags);
426 akorotkov 517 ECB : }
934 heikki.linnakangas 518 :
426 akorotkov 519 GIC 97771 : gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
934 heikki.linnakangas 520 97771 : }
934 heikki.linnakangas 521 ECB :
522 : static void
426 akorotkov 523 GIC 140 : gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
524 : GistSortedBuildLevelState *levelstate)
934 heikki.linnakangas 525 ECB : {
526 : GistSortedBuildLevelState *parent;
527 : BlockNumber blkno;
528 : MemoryContext oldCtx;
529 : IndexTuple union_tuple;
530 : SplitedPageLayout *dist;
531 : IndexTuple *itvec;
532 : int vect_len;
426 akorotkov 533 GIC 140 : bool isleaf = GistPageIsLeaf(levelstate->pages[0]);
534 :
934 heikki.linnakangas 535 CBC 140 : CHECK_FOR_INTERRUPTS();
536 :
426 akorotkov 537 140 : oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
538 :
426 akorotkov 539 ECB : /* Get index tuples from first page */
426 akorotkov 540 GIC 140 : itvec = gistextractpage(levelstate->pages[0], &vect_len);
541 140 : if (levelstate->current_page > 0)
426 akorotkov 542 ECB : {
543 : /* Append tuples from each page */
426 akorotkov 544 GIC 531 : for (int i = 1; i < levelstate->current_page + 1; i++)
545 : {
426 akorotkov 546 ECB : int len_local;
426 akorotkov 547 GIC 394 : IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
548 :
426 akorotkov 549 CBC 394 : itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
426 akorotkov 550 GIC 394 : pfree(itvec_local);
426 akorotkov 551 ECB : }
552 :
553 : /* Apply picksplit to list of all collected tuples */
426 akorotkov 554 GIC 137 : dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
555 : }
426 akorotkov 556 ECB : else
557 : {
558 : /* Create splitted layout from single page */
426 akorotkov 559 GIC 3 : dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
560 3 : union_tuple = gistunion(state->indexrel, itvec, vect_len,
426 akorotkov 561 ECB : state->giststate);
426 akorotkov 562 CBC 3 : dist->itup = union_tuple;
426 akorotkov 563 GIC 3 : dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
426 akorotkov 564 CBC 3 : dist->block.num = vect_len;
426 akorotkov 565 ECB : }
4231 heikki.linnakangas 566 :
934 heikki.linnakangas 567 GIC 140 : MemoryContextSwitchTo(oldCtx);
568 :
426 akorotkov 569 ECB : /* Reset page counter */
426 akorotkov 570 GIC 140 : levelstate->current_page = 0;
571 :
426 akorotkov 572 ECB : /* Create pages for all partitions in split result */
426 akorotkov 573 GIC 839 : for (; dist != NULL; dist = dist->next)
574 : {
426 akorotkov 575 ECB : char *data;
576 : Page target;
577 :
578 : /* check once per page */
426 akorotkov 579 GIC 699 : CHECK_FOR_INTERRUPTS();
580 :
426 akorotkov 581 ECB : /* Create page and copy data */
426 akorotkov 582 GIC 699 : data = (char *) (dist->list);
1 tmunro 583 GNC 699 : target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
426 akorotkov 584 CBC 699 : gistinitpage(target, isleaf ? F_LEAF : 0);
585 97705 : for (int i = 0; i < dist->block.num; i++)
426 akorotkov 586 ECB : {
426 akorotkov 587 CBC 97006 : IndexTuple thistup = (IndexTuple) data;
588 :
589 97006 : if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
426 akorotkov 590 UIC 0 : elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
426 akorotkov 591 ECB :
426 akorotkov 592 GBC 97006 : data += IndexTupleSize(thistup);
593 : }
426 akorotkov 594 CBC 699 : union_tuple = dist->itup;
595 :
596 699 : if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
426 akorotkov 597 GIC 18 : gist_indexsortbuild_flush_ready_pages(state);
426 akorotkov 598 ECB :
599 : /*
600 : * The page is now complete. Assign a block number to it, and add it
601 : * to the list of finished pages. (We don't write it out immediately,
602 : * because we want to WAL-log the pages in batches.)
603 : */
426 akorotkov 604 GIC 699 : blkno = state->pages_allocated++;
605 699 : state->ready_blknos[state->ready_num_pages] = blkno;
426 akorotkov 606 CBC 699 : state->ready_pages[state->ready_num_pages] = target;
607 699 : state->ready_num_pages++;
608 699 : ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
426 akorotkov 609 ECB :
610 : /*
611 : * Set the right link to point to the previous page. This is just for
612 : * debugging purposes: GiST only follows the right link if a page is
613 : * split concurrently to a scan, and that cannot happen during index
614 : * build.
615 : *
616 : * It's a bit counterintuitive that we set the right link on the new
617 : * page to point to the previous page, not the other way around. But
618 : * GiST pages are not ordered like B-tree pages are, so as long as the
619 : * right-links form a chain through all the pages at the same level,
620 : * the order doesn't matter.
621 : */
426 akorotkov 622 GIC 699 : if (levelstate->last_blkno)
623 686 : GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
426 akorotkov 624 CBC 699 : levelstate->last_blkno = blkno;
426 akorotkov 625 ECB :
626 : /*
627 : * Insert the downlink to the parent page. If this was the root,
628 : * create a new page as the parent, which becomes the new root.
629 : */
426 akorotkov 630 GIC 699 : parent = levelstate->parent;
631 699 : if (parent == NULL)
426 akorotkov 632 ECB : {
426 akorotkov 633 CBC 13 : parent = palloc0(sizeof(GistSortedBuildLevelState));
1 tmunro 634 GNC 13 : parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
426 akorotkov 635 CBC 13 : parent->parent = NULL;
636 13 : gistinitpage(parent->pages[0], 0);
426 akorotkov 637 ECB :
426 akorotkov 638 CBC 13 : levelstate->parent = parent;
639 : }
640 699 : gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
641 : }
934 heikki.linnakangas 642 140 : }
643 :
934 heikki.linnakangas 644 ECB : static void
934 heikki.linnakangas 645 GIC 87 : gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
646 : {
934 heikki.linnakangas 647 CBC 87 : if (state->ready_num_pages == 0)
934 heikki.linnakangas 648 GIC 56 : return;
934 heikki.linnakangas 649 ECB :
934 heikki.linnakangas 650 CBC 730 : for (int i = 0; i < state->ready_num_pages; i++)
651 : {
652 699 : Page page = state->ready_pages[i];
930 heikki.linnakangas 653 GIC 699 : BlockNumber blkno = state->ready_blknos[i];
934 heikki.linnakangas 654 ECB :
655 : /* Currently, the blocks must be buffered in order. */
930 heikki.linnakangas 656 GIC 699 : if (blkno != state->pages_written)
934 heikki.linnakangas 657 UIC 0 : elog(ERROR, "unexpected block number to flush GiST sorting build");
934 heikki.linnakangas 658 ECB :
934 heikki.linnakangas 659 GBC 699 : PageSetLSN(page, GistBuildLSN);
930 heikki.linnakangas 660 GIC 699 : PageSetChecksumInplace(page, blkno);
636 tgl 661 CBC 699 : smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
636 tgl 662 ECB : true);
934 heikki.linnakangas 663 :
930 heikki.linnakangas 664 GIC 699 : state->pages_written++;
665 : }
934 heikki.linnakangas 666 ECB :
934 heikki.linnakangas 667 GIC 31 : if (RelationNeedsWAL(state->indexrel))
277 rhaas 668 GNC 19 : log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages,
934 heikki.linnakangas 669 CBC 19 : state->ready_blknos, state->ready_pages, true);
934 heikki.linnakangas 670 ECB :
934 heikki.linnakangas 671 CBC 730 : for (int i = 0; i < state->ready_num_pages; i++)
934 heikki.linnakangas 672 GIC 699 : pfree(state->ready_pages[i]);
934 heikki.linnakangas 673 ECB :
934 heikki.linnakangas 674 CBC 31 : state->ready_num_pages = 0;
675 : }
4231 heikki.linnakangas 676 ECB :
677 :
678 : /*-------------------------------------------------------------------------
679 : * Routines for non-sorted build
680 : *-------------------------------------------------------------------------
681 : */
682 :
683 : /*
684 : * Attempt to switch to buffering mode.
685 : *
686 : * If there is not enough memory for buffering build, sets bufferingMode
687 : * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
688 : * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
689 : * GIST_BUFFERING_ACTIVE.
690 : */
691 : static void
4231 heikki.linnakangas 692 GIC 3 : gistInitBuffering(GISTBuildState *buildstate)
693 : {
4231 heikki.linnakangas 694 CBC 3 : Relation index = buildstate->indexrel;
695 : int pagesPerBuffer;
4231 heikki.linnakangas 696 ECB : Size pageFreeSpace;
697 : Size itupAvgSize,
698 : itupMinSize;
699 : double avgIndexTuplesPerPage,
700 : maxIndexTuplesPerPage;
701 : int i;
702 : int levelStep;
703 :
704 : /* Calc space of index page which is available for index tuples */
4231 heikki.linnakangas 705 GIC 3 : pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
706 : - sizeof(ItemIdData)
4231 heikki.linnakangas 707 CBC 3 : - buildstate->freespace;
708 :
4231 heikki.linnakangas 709 ECB : /*
710 : * Calculate average size of already inserted index tuples using gathered
711 : * statistics.
712 : */
4231 heikki.linnakangas 713 GIC 3 : itupAvgSize = (double) buildstate->indtuplesSize /
714 3 : (double) buildstate->indtuples;
4231 heikki.linnakangas 715 ECB :
716 : /*
717 : * Calculate minimal possible size of index tuple by index metadata.
718 : * Minimal possible size of varlena is VARHDRSZ.
719 : *
720 : * XXX: that's not actually true, as a short varlen can be just 2 bytes.
721 : * And we should take padding into account here.
722 : */
4231 heikki.linnakangas 723 GIC 3 : itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
724 6 : for (i = 0; i < index->rd_att->natts; i++)
4231 heikki.linnakangas 725 ECB : {
2058 andres 726 CBC 3 : if (TupleDescAttr(index->rd_att, i)->attlen < 0)
4231 heikki.linnakangas 727 UIC 0 : itupMinSize += VARHDRSZ;
4231 heikki.linnakangas 728 ECB : else
2058 andres 729 GBC 3 : itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
730 : }
4231 heikki.linnakangas 731 ECB :
732 : /* Calculate average and maximal number of index tuples which fit to page */
4231 heikki.linnakangas 733 GIC 3 : avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
734 3 : maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
4231 heikki.linnakangas 735 ECB :
736 : /*
737 : * We need to calculate two parameters for the buffering algorithm:
738 : * levelStep and pagesPerBuffer.
739 : *
740 : * levelStep determines the size of subtree that we operate on, while
741 : * emptying a buffer. A higher value is better, as you need fewer buffer
742 : * emptying steps to build the index. However, if you set it too high, the
743 : * subtree doesn't fit in cache anymore, and you quickly lose the benefit
744 : * of the buffers.
745 : *
746 : * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
747 : * the number of tuples on page (ie. fanout), and M is the amount of
748 : * internal memory available. Curiously, they doesn't explain *why* that
749 : * setting is optimal. We calculate it by taking the highest levelStep so
750 : * that a subtree still fits in cache. For a small B, our way of
751 : * calculating levelStep is very close to Arge et al's formula. For a
752 : * large B, our formula gives a value that is 2x higher.
753 : *
754 : * The average size (in pages) of a subtree of depth n can be calculated
755 : * as a geometric series:
756 : *
757 : * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
758 : *
759 : * where B is the average number of index tuples on page. The subtree is
760 : * cached in the shared buffer cache and the OS cache, so we choose
761 : * levelStep so that the subtree size is comfortably smaller than
762 : * effective_cache_size, with a safety factor of 4.
763 : *
764 : * The estimate on the average number of index tuples on page is based on
765 : * average tuple sizes observed before switching to buffered build, so the
766 : * real subtree size can be somewhat larger. Also, it would selfish to
767 : * gobble the whole cache for our index build. The safety factor of 4
768 : * should account for those effects.
769 : *
770 : * The other limiting factor for setting levelStep is that while
771 : * processing a subtree, we need to hold one page for each buffer at the
772 : * next lower buffered level. The max. number of buffers needed for that
773 : * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
774 : * hopefully maintenance_work_mem is set high enough that you're
775 : * constrained by effective_cache_size rather than maintenance_work_mem.
776 : *
777 : * XXX: the buffer hash table consumes a fair amount of memory too per
778 : * buffer, but that is not currently taken into account. That scales on
779 : * the total number of buffers used, ie. the index size and on levelStep.
780 : * Note that a higher levelStep *reduces* the amount of memory needed for
781 : * the hash table.
782 : */
4231 heikki.linnakangas 783 GIC 3 : levelStep = 1;
784 : for (;;)
4231 heikki.linnakangas 785 CBC 3 : {
786 : double subtreesize;
3967 heikki.linnakangas 787 ECB : double maxlowestlevelpages;
788 :
789 : /* size of an average subtree at this levelStep (in pages). */
3967 heikki.linnakangas 790 GIC 6 : subtreesize =
791 6 : (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
3967 heikki.linnakangas 792 CBC 6 : (1 - avgIndexTuplesPerPage);
3967 heikki.linnakangas 793 ECB :
794 : /* max number of pages at the lowest level of a subtree */
3967 heikki.linnakangas 795 GIC 6 : maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
796 :
3967 heikki.linnakangas 797 ECB : /* subtree must fit in cache (with safety factor of 4) */
3967 heikki.linnakangas 798 GIC 6 : if (subtreesize > effective_cache_size / 4)
3967 heikki.linnakangas 799 UIC 0 : break;
3967 heikki.linnakangas 800 ECB :
3967 heikki.linnakangas 801 EUB : /* each node in the lowest level of a subtree has one page in memory */
3967 heikki.linnakangas 802 GIC 6 : if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
803 3 : break;
3967 heikki.linnakangas 804 ECB :
805 : /* Good, we can handle this levelStep. See if we can go one higher. */
4231 heikki.linnakangas 806 GIC 3 : levelStep++;
807 : }
4231 heikki.linnakangas 808 ECB :
809 : /*
810 : * We just reached an unacceptable value of levelStep in previous loop.
811 : * So, decrease levelStep to get last acceptable value.
812 : */
4231 heikki.linnakangas 813 GIC 3 : levelStep--;
814 :
4231 heikki.linnakangas 815 ECB : /*
816 : * If there's not enough cache or maintenance_work_mem, fall back to plain
817 : * inserts.
818 : */
4231 heikki.linnakangas 819 GIC 3 : if (levelStep <= 0)
820 : {
4231 heikki.linnakangas 821 LBC 0 : elog(DEBUG1, "failed to switch to buffered GiST build");
934 heikki.linnakangas 822 UIC 0 : buildstate->buildMode = GIST_BUFFERING_DISABLED;
4231 heikki.linnakangas 823 UBC 0 : return;
4231 heikki.linnakangas 824 EUB : }
825 :
826 : /*
827 : * The second parameter to set is pagesPerBuffer, which determines the
828 : * size of each buffer. We adjust pagesPerBuffer also during the build,
829 : * which is why this calculation is in a separate function.
830 : */
4231 heikki.linnakangas 831 GIC 3 : pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
832 :
4231 heikki.linnakangas 833 ECB : /* Initialize GISTBuildBuffers with these parameters */
4231 heikki.linnakangas 834 GIC 3 : buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
835 : gistGetMaxLevel(index));
4231 heikki.linnakangas 836 ECB :
3966 heikki.linnakangas 837 GIC 3 : gistInitParentMap(buildstate);
838 :
934 heikki.linnakangas 839 CBC 3 : buildstate->buildMode = GIST_BUFFERING_ACTIVE;
840 :
4231 841 3 : elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
842 : levelStep, pagesPerBuffer);
4231 heikki.linnakangas 843 ECB : }
844 :
845 : /*
846 : * Calculate pagesPerBuffer parameter for the buffering algorithm.
847 : *
848 : * Buffer size is chosen so that assuming that tuples are distributed
849 : * randomly, emptying half a buffer fills on average one page in every buffer
850 : * at the next lower level.
851 : */
852 : static int
4231 heikki.linnakangas 853 GIC 6 : calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
854 : {
4231 heikki.linnakangas 855 ECB : double pagesPerBuffer;
856 : double avgIndexTuplesPerPage;
857 : double itupAvgSize;
858 : Size pageFreeSpace;
859 :
860 : /* Calc space of index page which is available for index tuples */
4231 heikki.linnakangas 861 GIC 6 : pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
862 : - sizeof(ItemIdData)
4231 heikki.linnakangas 863 CBC 6 : - buildstate->freespace;
864 :
4231 heikki.linnakangas 865 ECB : /*
866 : * Calculate average size of already inserted index tuples using gathered
867 : * statistics.
868 : */
4231 heikki.linnakangas 869 GIC 6 : itupAvgSize = (double) buildstate->indtuplesSize /
870 6 : (double) buildstate->indtuples;
4231 heikki.linnakangas 871 ECB :
4231 heikki.linnakangas 872 CBC 6 : avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
873 :
4231 heikki.linnakangas 874 ECB : /*
875 : * Recalculate required size of buffers.
876 : */
4231 heikki.linnakangas 877 GIC 6 : pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
878 :
4231 tgl 879 CBC 6 : return (int) rint(pagesPerBuffer);
880 : }
4231 heikki.linnakangas 881 ECB :
882 : /*
883 : * Per-tuple callback for table_index_build_scan.
884 : */
885 : static void
4231 heikki.linnakangas 886 GIC 321974 : gistBuildCallback(Relation index,
887 : ItemPointer tid,
4231 heikki.linnakangas 888 ECB : Datum *values,
889 : bool *isnull,
890 : bool tupleIsAlive,
891 : void *state)
892 : {
4231 heikki.linnakangas 893 GIC 321974 : GISTBuildState *buildstate = (GISTBuildState *) state;
894 : IndexTuple itup;
4231 heikki.linnakangas 895 ECB : MemoryContext oldCtx;
896 :
4209 tgl 897 GIC 321974 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
898 :
4231 heikki.linnakangas 899 ECB : /* form an index tuple and point it at the heap tuple */
934 heikki.linnakangas 900 GIC 321974 : itup = gistFormTuple(buildstate->giststate, index,
901 : values, isnull,
934 heikki.linnakangas 902 ECB : true);
1248 andres 903 GIC 321974 : itup->t_tid = *tid;
904 :
11 tgl 905 ECB : /* Update tuple count and total size. */
11 tgl 906 GIC 321974 : buildstate->indtuples += 1;
907 321974 : buildstate->indtuplesSize += IndexTupleSize(itup);
11 tgl 908 ECB :
909 : /*
910 : * XXX In buffering builds, the tempCxt is also reset down inside
911 : * gistProcessEmptyingQueue(). This is not great because it risks
912 : * confusion and possible use of dangling pointers (for example, itup
913 : * might be already freed when control returns here). It's generally
914 : * better that a memory context be "owned" by only one function. However,
915 : * currently this isn't causing issues so it doesn't seem worth the amount
916 : * of refactoring that would be needed to avoid it.
917 : */
934 heikki.linnakangas 918 GIC 321974 : if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
919 : {
4231 heikki.linnakangas 920 ECB : /* We have buffers, so use them. */
4231 heikki.linnakangas 921 GIC 17715 : gistBufferingBuildInsert(buildstate, itup);
922 : }
4231 heikki.linnakangas 923 ECB : else
924 : {
925 : /*
926 : * There's no buffers (yet). Since we already have the index relation
927 : * locked, we call gistdoinsert directly.
928 : */
4231 heikki.linnakangas 929 GIC 304259 : gistdoinsert(index, itup, buildstate->freespace,
930 : buildstate->giststate, buildstate->heaprel, true);
4231 heikki.linnakangas 931 ECB : }
932 :
4231 heikki.linnakangas 933 GIC 321974 : MemoryContextSwitchTo(oldCtx);
4209 tgl 934 321974 : MemoryContextReset(buildstate->giststate->tempCxt);
4231 heikki.linnakangas 935 ECB :
934 heikki.linnakangas 936 CBC 321974 : if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
4231 heikki.linnakangas 937 GIC 17715 : buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
4231 heikki.linnakangas 938 ECB : {
939 : /* Adjust the target buffer size now */
4231 heikki.linnakangas 940 GIC 3 : buildstate->gfbb->pagesPerBuffer =
941 3 : calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
4231 heikki.linnakangas 942 ECB : }
943 :
944 : /*
945 : * In 'auto' mode, check if the index has grown too large to fit in cache,
946 : * and switch to buffering mode if it has.
947 : *
948 : * To avoid excessive calls to smgrnblocks(), only check this every
949 : * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
950 : *
951 : * In 'stats' state, switch as soon as we have seen enough tuples to have
952 : * some idea of the average tuple size.
953 : */
934 heikki.linnakangas 954 GIC 321974 : if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
4231 955 291971 : buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
636 tgl 956 CBC 1098 : effective_cache_size < smgrnblocks(RelationGetSmgr(index),
957 321974 : MAIN_FORKNUM)) ||
934 heikki.linnakangas 958 321974 : (buildstate->buildMode == GIST_BUFFERING_STATS &&
4231 959 12288 : buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
4231 heikki.linnakangas 960 ECB : {
961 : /*
962 : * Index doesn't fit in effective cache anymore. Try to switch to
963 : * buffering build mode.
964 : */
4231 heikki.linnakangas 965 GIC 3 : gistInitBuffering(buildstate);
966 : }
4231 heikki.linnakangas 967 CBC 321974 : }
968 :
4231 heikki.linnakangas 969 ECB : /*
970 : * Insert function for buffering index build.
971 : */
972 : static void
4231 heikki.linnakangas 973 GIC 17715 : gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
974 : {
4231 heikki.linnakangas 975 ECB : /* Insert the tuple to buffers. */
3966 heikki.linnakangas 976 GIC 17715 : gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
977 :
4231 heikki.linnakangas 978 ECB : /* If we filled up (half of a) buffer, process buffer emptying. */
4231 heikki.linnakangas 979 GIC 17715 : gistProcessEmptyingQueue(buildstate);
980 17715 : }
4231 heikki.linnakangas 981 ECB :
982 : /*
983 : * Process an index tuple. Runs the tuple down the tree until we reach a leaf
984 : * page or node buffer, and inserts the tuple there. Returns true if we have
985 : * to stop buffer emptying process (because one of child buffers can't take
986 : * index tuples anymore).
987 : */
988 : static bool
4231 heikki.linnakangas 989 GIC 34881 : gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
990 : BlockNumber startblkno, int startlevel)
4231 heikki.linnakangas 991 ECB : {
4209 tgl 992 GIC 34881 : GISTSTATE *giststate = buildstate->giststate;
4231 heikki.linnakangas 993 34881 : GISTBuildBuffers *gfbb = buildstate->gfbb;
4231 heikki.linnakangas 994 CBC 34881 : Relation indexrel = buildstate->indexrel;
4231 heikki.linnakangas 995 ECB : BlockNumber childblkno;
996 : Buffer buffer;
4231 heikki.linnakangas 997 GIC 34881 : bool result = false;
998 : BlockNumber blkno;
3966 heikki.linnakangas 999 ECB : int level;
3966 heikki.linnakangas 1000 GIC 34881 : OffsetNumber downlinkoffnum = InvalidOffsetNumber;
1001 34881 : BlockNumber parentblkno = InvalidBlockNumber;
4231 heikki.linnakangas 1002 ECB :
3966 heikki.linnakangas 1003 CBC 34881 : CHECK_FOR_INTERRUPTS();
1004 :
4231 heikki.linnakangas 1005 ECB : /*
1006 : * Loop until we reach a leaf page (level == 0) or a level with buffers
1007 : * (not including the level we start at, because we would otherwise make
1008 : * no progress).
1009 : */
3966 heikki.linnakangas 1010 GIC 34881 : blkno = startblkno;
1011 34881 : level = startlevel;
4231 heikki.linnakangas 1012 ECB : for (;;)
4231 heikki.linnakangas 1013 CBC 34881 : {
1014 : ItemId iid;
4231 heikki.linnakangas 1015 ECB : IndexTuple idxtuple,
1016 : newtup;
1017 : Page page;
1018 : OffsetNumber childoffnum;
1019 :
1020 : /* Have we reached a level with buffers? */
3966 heikki.linnakangas 1021 GIC 69762 : if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
4231 1022 17166 : break;
4231 heikki.linnakangas 1023 ECB :
1024 : /* Have we reached a leaf page? */
3966 heikki.linnakangas 1025 GIC 52596 : if (level == 0)
4231 1026 17715 : break;
4231 heikki.linnakangas 1027 ECB :
1028 : /*
1029 : * Nope. Descend down to the next level then. Choose a child to
1030 : * descend down to.
1031 : */
1032 :
3966 heikki.linnakangas 1033 GIC 34881 : buffer = ReadBuffer(indexrel, blkno);
4231 1034 34881 : LockBuffer(buffer, GIST_EXCLUSIVE);
4231 heikki.linnakangas 1035 ECB :
2545 kgrittn 1036 CBC 34881 : page = (Page) BufferGetPage(buffer);
4231 heikki.linnakangas 1037 GIC 34881 : childoffnum = gistchoose(indexrel, page, itup, giststate);
4231 heikki.linnakangas 1038 CBC 34881 : iid = PageGetItemId(page, childoffnum);
1039 34881 : idxtuple = (IndexTuple) PageGetItem(page, iid);
1040 34881 : childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
4231 heikki.linnakangas 1041 ECB :
3966 heikki.linnakangas 1042 CBC 34881 : if (level > 1)
3966 heikki.linnakangas 1043 GIC 17166 : gistMemorizeParent(buildstate, childblkno, blkno);
3966 heikki.linnakangas 1044 ECB :
4231 1045 : /*
1046 : * Check that the key representing the target child node is consistent
1047 : * with the key we're inserting. Update it if it's not.
1048 : */
4231 heikki.linnakangas 1049 GIC 34881 : newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
1050 34881 : if (newtup)
3966 heikki.linnakangas 1051 ECB : {
3599 sfrost 1052 CBC 34473 : blkno = gistbufferinginserttuples(buildstate,
1053 : buffer,
3599 sfrost 1054 ECB : level,
1055 : &newtup,
1056 : 1,
1057 : childoffnum,
1058 : InvalidBlockNumber,
1059 : InvalidOffsetNumber);
1060 : /* gistbufferinginserttuples() released the buffer */
1061 : }
1062 : else
3966 heikki.linnakangas 1063 GIC 408 : UnlockReleaseBuffer(buffer);
1064 :
3966 heikki.linnakangas 1065 ECB : /* Descend to the child */
3966 heikki.linnakangas 1066 GIC 34881 : parentblkno = blkno;
1067 34881 : blkno = childblkno;
3966 heikki.linnakangas 1068 CBC 34881 : downlinkoffnum = childoffnum;
1069 34881 : Assert(level > 0);
1070 34881 : level--;
4231 heikki.linnakangas 1071 ECB : }
1072 :
3966 heikki.linnakangas 1073 GIC 34881 : if (LEVEL_HAS_BUFFERS(level, gfbb))
4231 1074 17166 : {
4231 heikki.linnakangas 1075 ECB : /*
1076 : * We've reached level with buffers. Place the index tuple to the
1077 : * buffer, and add the buffer to the emptying queue if it overflows.
1078 : */
1079 : GISTNodeBuffer *childNodeBuffer;
1080 :
1081 : /* Find the buffer or create a new one */
3966 heikki.linnakangas 1082 GIC 17166 : childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
1083 :
4231 heikki.linnakangas 1084 ECB : /* Add index tuple to it */
4231 heikki.linnakangas 1085 GIC 17166 : gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
1086 :
4231 heikki.linnakangas 1087 CBC 17166 : if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
4231 heikki.linnakangas 1088 UIC 0 : result = true;
4231 heikki.linnakangas 1089 ECB : }
4231 heikki.linnakangas 1090 EUB : else
1091 : {
1092 : /*
1093 : * We've reached a leaf page. Place the tuple here.
1094 : */
3966 heikki.linnakangas 1095 GIC 17715 : Assert(level == 0);
1096 17715 : buffer = ReadBuffer(indexrel, blkno);
4231 heikki.linnakangas 1097 CBC 17715 : LockBuffer(buffer, GIST_EXCLUSIVE);
3966 1098 17715 : gistbufferinginserttuples(buildstate, buffer, level,
3966 heikki.linnakangas 1099 ECB : &itup, 1, InvalidOffsetNumber,
1100 : parentblkno, downlinkoffnum);
1101 : /* gistbufferinginserttuples() released the buffer */
1102 : }
1103 :
4231 heikki.linnakangas 1104 GIC 34881 : return result;
1105 : }
4231 heikki.linnakangas 1106 ECB :
1107 : /*
1108 : * Insert tuples to a given page.
1109 : *
1110 : * This is analogous with gistinserttuples() in the regular insertion code.
1111 : *
1112 : * Returns the block number of the page where the (first) new or updated tuple
1113 : * was inserted. Usually that's the original page, but might be a sibling page
1114 : * if the original page was split.
1115 : *
1116 : * Caller should hold a lock on 'buffer' on entry. This function will unlock
1117 : * and unpin it.
1118 : */
1119 : static BlockNumber
3966 heikki.linnakangas 1120 GIC 52572 : gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
1121 : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
3966 heikki.linnakangas 1122 ECB : BlockNumber parentblk, OffsetNumber downlinkoffnum)
1123 : {
4231 heikki.linnakangas 1124 GIC 52572 : GISTBuildBuffers *gfbb = buildstate->gfbb;
1125 : List *splitinfo;
4231 heikki.linnakangas 1126 ECB : bool is_split;
3602 bruce 1127 GIC 52572 : BlockNumber placed_to_blk = InvalidBlockNumber;
1128 :
4231 heikki.linnakangas 1129 CBC 52572 : is_split = gistplacetopage(buildstate->indexrel,
1130 : buildstate->freespace,
4209 tgl 1131 ECB : buildstate->giststate,
1132 : buffer,
1133 : itup, ntup, oldoffnum, &placed_to_blk,
1134 : InvalidBuffer,
1135 : &splitinfo,
1136 : false,
1137 : buildstate->heaprel, true);
1138 :
1139 : /*
1140 : * If this is a root split, update the root path item kept in memory. This
1141 : * ensures that all path stacks are always complete, including all parent
1142 : * nodes up to the root. That simplifies the algorithm to re-find correct
1143 : * parent.
1144 : */
4231 heikki.linnakangas 1145 GIC 52572 : if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
1146 : {
2545 kgrittn 1147 CBC 3 : Page page = BufferGetPage(buffer);
1148 : OffsetNumber off;
3966 heikki.linnakangas 1149 ECB : OffsetNumber maxoff;
1150 :
3966 heikki.linnakangas 1151 GIC 3 : Assert(level == gfbb->rootlevel);
1152 3 : gfbb->rootlevel++;
3966 heikki.linnakangas 1153 ECB :
3966 heikki.linnakangas 1154 CBC 3 : elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
1155 :
4231 heikki.linnakangas 1156 ECB : /*
1157 : * All the downlinks on the old root page are now on one of the child
1158 : * pages. Visit all the new child pages to memorize the parents of the
1159 : * grandchildren.
1160 : */
3966 heikki.linnakangas 1161 GIC 3 : if (gfbb->rootlevel > 1)
1162 : {
3966 heikki.linnakangas 1163 CBC 3 : maxoff = PageGetMaxOffsetNumber(page);
3966 heikki.linnakangas 1164 GIC 9 : for (off = FirstOffsetNumber; off <= maxoff; off++)
3966 heikki.linnakangas 1165 ECB : {
3955 bruce 1166 CBC 6 : ItemId iid = PageGetItemId(page, off);
3955 bruce 1167 GIC 6 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
3966 heikki.linnakangas 1168 CBC 6 : BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
3955 bruce 1169 6 : Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
4231 heikki.linnakangas 1170 ECB :
3966 heikki.linnakangas 1171 CBC 6 : LockBuffer(childbuf, GIST_SHARE);
3966 heikki.linnakangas 1172 GIC 6 : gistMemorizeAllDownlinks(buildstate, childbuf);
3966 heikki.linnakangas 1173 CBC 6 : UnlockReleaseBuffer(childbuf);
3966 heikki.linnakangas 1174 ECB :
1175 : /*
1176 : * Also remember that the parent of the new child page is the
1177 : * root block.
1178 : */
3966 heikki.linnakangas 1179 GIC 6 : gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
1180 : }
3966 heikki.linnakangas 1181 ECB : }
1182 : }
1183 :
4231 heikki.linnakangas 1184 GIC 52572 : if (splitinfo)
1185 : {
4231 heikki.linnakangas 1186 ECB : /*
1187 : * Insert the downlinks to the parent. This is analogous with
1188 : * gistfinishsplit() in the regular insertion code, but the locking is
1189 : * simpler, and we have to maintain the buffers on internal nodes and
1190 : * the parent map.
1191 : */
1192 : IndexTuple *downlinks;
1193 : int ndownlinks,
1194 : i;
1195 : Buffer parentBuffer;
1196 : ListCell *lc;
1197 :
1198 : /* Parent may have changed since we memorized this path. */
1199 : parentBuffer =
3966 heikki.linnakangas 1200 GIC 384 : gistBufferingFindCorrectParent(buildstate,
1201 : BufferGetBlockNumber(buffer),
3966 heikki.linnakangas 1202 ECB : level,
1203 : &parentblk,
1204 : &downlinkoffnum);
1205 :
1206 : /*
1207 : * If there's a buffer associated with this page, that needs to be
1208 : * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
1209 : * downlinks in 'splitinfo', to make sure they're consistent not only
1210 : * with the tuples already on the pages, but also the tuples in the
1211 : * buffers that will eventually be inserted to them.
1212 : */
4231 heikki.linnakangas 1213 GIC 384 : gistRelocateBuildBuffersOnSplit(gfbb,
1214 : buildstate->giststate,
4231 heikki.linnakangas 1215 ECB : buildstate->indexrel,
1216 : level,
1217 : buffer, splitinfo);
1218 :
1219 : /* Create an array of all the downlink tuples */
4231 heikki.linnakangas 1220 GIC 384 : ndownlinks = list_length(splitinfo);
1221 384 : downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
4231 heikki.linnakangas 1222 CBC 384 : i = 0;
1223 1152 : foreach(lc, splitinfo)
4231 heikki.linnakangas 1224 ECB : {
4231 heikki.linnakangas 1225 CBC 768 : GISTPageSplitInfo *splitinfo = lfirst(lc);
1226 :
3966 heikki.linnakangas 1227 ECB : /*
1228 : * Remember the parent of each new child page in our parent map.
1229 : * This assumes that the downlinks fit on the parent page. If the
1230 : * parent page is split, too, when we recurse up to insert the
1231 : * downlinks, the recursive gistbufferinginserttuples() call will
1232 : * update the map again.
1233 : */
3966 heikki.linnakangas 1234 GIC 768 : if (level > 0)
1235 12 : gistMemorizeParent(buildstate,
3966 heikki.linnakangas 1236 ECB : BufferGetBlockNumber(splitinfo->buf),
1237 : BufferGetBlockNumber(parentBuffer));
1238 :
1239 : /*
1240 : * Also update the parent map for all the downlinks that got moved
1241 : * to a different page. (actually this also loops through the
1242 : * downlinks that stayed on the original page, but it does no
1243 : * harm).
1244 : */
3966 heikki.linnakangas 1245 GIC 768 : if (level > 1)
3966 heikki.linnakangas 1246 UIC 0 : gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
3966 heikki.linnakangas 1247 ECB :
4231 heikki.linnakangas 1248 EUB : /*
1249 : * Since there's no concurrent access, we can release the lower
1250 : * level buffers immediately. This includes the original page.
1251 : */
3966 heikki.linnakangas 1252 GIC 768 : UnlockReleaseBuffer(splitinfo->buf);
4231 1253 768 : downlinks[i++] = splitinfo->downlink;
4231 heikki.linnakangas 1254 ECB : }
1255 :
1256 : /* Insert them into parent. */
3966 heikki.linnakangas 1257 GIC 384 : gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
1258 : downlinks, ndownlinks, downlinkoffnum,
3966 heikki.linnakangas 1259 ECB : InvalidBlockNumber, InvalidOffsetNumber);
1260 :
2118 tgl 1261 GIC 384 : list_free_deep(splitinfo); /* we don't need this anymore */
1262 : }
3966 heikki.linnakangas 1263 ECB : else
3966 heikki.linnakangas 1264 GIC 52188 : UnlockReleaseBuffer(buffer);
1265 :
3888 heikki.linnakangas 1266 CBC 52572 : return placed_to_blk;
1267 : }
4231 heikki.linnakangas 1268 ECB :
1269 : /*
1270 : * Find the downlink pointing to a child page.
1271 : *
1272 : * 'childblkno' indicates the child page to find the parent for. 'level' is
1273 : * the level of the child. On entry, *parentblkno and *downlinkoffnum can
1274 : * point to a location where the downlink used to be - we will check that
1275 : * location first, and save some cycles if it hasn't moved. The function
1276 : * returns a buffer containing the downlink, exclusively-locked, and
1277 : * *parentblkno and *downlinkoffnum are set to the real location of the
1278 : * downlink.
1279 : *
1280 : * If the child page is a leaf (level == 0), the caller must supply a correct
1281 : * parentblkno. Otherwise we use the parent map hash table to find the parent
1282 : * block.
1283 : *
1284 : * This function serves the same purpose as gistFindCorrectParent() during
1285 : * normal index inserts, but this is simpler because we don't need to deal
1286 : * with concurrent inserts.
1287 : */
1288 : static Buffer
4231 heikki.linnakangas 1289 GIC 384 : gistBufferingFindCorrectParent(GISTBuildState *buildstate,
1290 : BlockNumber childblkno, int level,
3966 heikki.linnakangas 1291 ECB : BlockNumber *parentblkno,
1292 : OffsetNumber *downlinkoffnum)
1293 : {
1294 : BlockNumber parent;
1295 : Buffer buffer;
1296 : Page page;
1297 : OffsetNumber maxoff;
1298 : OffsetNumber off;
1299 :
3966 heikki.linnakangas 1300 GIC 384 : if (level > 0)
1301 6 : parent = gistGetParent(buildstate, childblkno);
3966 heikki.linnakangas 1302 ECB : else
1303 : {
1304 : /*
1305 : * For a leaf page, the caller must supply a correct parent block
1306 : * number.
1307 : */
3966 heikki.linnakangas 1308 GIC 378 : if (*parentblkno == InvalidBlockNumber)
722 peter 1309 UIC 0 : elog(ERROR, "no parent buffer provided of child %u", childblkno);
3966 heikki.linnakangas 1310 CBC 378 : parent = *parentblkno;
3966 heikki.linnakangas 1311 EUB : }
3966 heikki.linnakangas 1312 ECB :
3966 heikki.linnakangas 1313 GIC 384 : buffer = ReadBuffer(buildstate->indexrel, parent);
2545 kgrittn 1314 384 : page = BufferGetPage(buffer);
4231 heikki.linnakangas 1315 CBC 384 : LockBuffer(buffer, GIST_EXCLUSIVE);
3966 1316 384 : gistcheckpage(buildstate->indexrel, buffer);
1317 384 : maxoff = PageGetMaxOffsetNumber(page);
4231 heikki.linnakangas 1318 ECB :
1319 : /* Check if it was not moved */
3966 heikki.linnakangas 1320 GIC 384 : if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
1321 378 : *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
4231 heikki.linnakangas 1322 ECB : {
3955 bruce 1323 CBC 378 : ItemId iid = PageGetItemId(page, *downlinkoffnum);
3955 bruce 1324 GIC 378 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
3955 bruce 1325 ECB :
3966 heikki.linnakangas 1326 CBC 378 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1327 : {
4231 heikki.linnakangas 1328 ECB : /* Still there */
3966 heikki.linnakangas 1329 GIC 378 : return buffer;
1330 : }
4231 heikki.linnakangas 1331 ECB : }
1332 :
1333 : /*
1334 : * Downlink was not at the offset where it used to be. Scan the page to
1335 : * find it. During normal gist insertions, it might've moved to another
1336 : * page, to the right, but during a buffering build, we keep track of the
1337 : * parent of each page in the lookup table so we should always know what
1338 : * page it's on.
1339 : */
3966 heikki.linnakangas 1340 GIC 15 : for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
1341 : {
3955 bruce 1342 CBC 15 : ItemId iid = PageGetItemId(page, off);
3955 bruce 1343 GIC 15 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
3955 bruce 1344 ECB :
3966 heikki.linnakangas 1345 CBC 15 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1346 : {
3966 heikki.linnakangas 1347 ECB : /* yes!!, found it */
3966 heikki.linnakangas 1348 GIC 6 : *downlinkoffnum = off;
1349 6 : return buffer;
4231 heikki.linnakangas 1350 ECB : }
1351 : }
1352 :
3966 heikki.linnakangas 1353 UIC 0 : elog(ERROR, "failed to re-find parent for block %u", childblkno);
1354 : return InvalidBuffer; /* keep compiler quiet */
4231 heikki.linnakangas 1355 EUB : }
1356 :
1357 : /*
1358 : * Process buffers emptying stack. Emptying of one buffer can cause emptying
1359 : * of other buffers. This function iterates until this cascading emptying
1360 : * process finished, e.g. until buffers emptying stack is empty.
1361 : */
1362 : static void
4231 heikki.linnakangas 1363 GIC 17724 : gistProcessEmptyingQueue(GISTBuildState *buildstate)
1364 : {
4231 heikki.linnakangas 1365 CBC 17724 : GISTBuildBuffers *gfbb = buildstate->gfbb;
1366 :
4231 heikki.linnakangas 1367 ECB : /* Iterate while we have elements in buffers emptying stack. */
4231 heikki.linnakangas 1368 GIC 17733 : while (gfbb->bufferEmptyingQueue != NIL)
1369 : {
4231 heikki.linnakangas 1370 ECB : GISTNodeBuffer *emptyingNodeBuffer;
1371 :
1372 : /* Get node buffer from emptying stack. */
4231 heikki.linnakangas 1373 GIC 9 : emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
1374 9 : gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
4231 heikki.linnakangas 1375 CBC 9 : emptyingNodeBuffer->queuedForEmptying = false;
4231 heikki.linnakangas 1376 ECB :
1377 : /*
1378 : * We are going to load last pages of buffers where emptying will be
1379 : * to. So let's unload any previously loaded buffers.
1380 : */
4231 heikki.linnakangas 1381 GIC 9 : gistUnloadNodeBuffers(gfbb);
1382 :
4231 heikki.linnakangas 1383 ECB : /*
1384 : * Pop tuples from the buffer and run them down to the buffers at
1385 : * lower level, or leaf pages. We continue until one of the lower
1386 : * level buffers fills up, or this buffer runs empty.
1387 : *
1388 : * In Arge et al's paper, the buffer emptying is stopped after
1389 : * processing 1/2 node buffer worth of tuples, to avoid overfilling
1390 : * any of the lower level buffers. However, it's more efficient to
1391 : * keep going until one of the lower level buffers actually fills up,
1392 : * so that's what we do. This doesn't need to be exact, if a buffer
1393 : * overfills by a few tuples, there's no harm done.
1394 : */
1395 : while (true)
4231 heikki.linnakangas 1396 GIC 17166 : {
1397 : IndexTuple itup;
4231 heikki.linnakangas 1398 ECB :
1399 : /* Get next index tuple from the buffer */
4231 heikki.linnakangas 1400 GIC 17175 : if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
1401 9 : break;
4231 heikki.linnakangas 1402 ECB :
1403 : /*
1404 : * Run it down to the underlying node buffer or leaf page.
1405 : *
1406 : * Note: it's possible that the buffer we're emptying splits as a
1407 : * result of this call. If that happens, our emptyingNodeBuffer
1408 : * points to the left half of the split. After split, it's very
1409 : * likely that the new left buffer is no longer over the half-full
1410 : * threshold, but we might as well keep flushing tuples from it
1411 : * until we fill a lower-level buffer.
1412 : */
3966 heikki.linnakangas 1413 GIC 17166 : if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
1414 : {
4231 heikki.linnakangas 1415 ECB : /*
1416 : * A lower level buffer filled up. Stop emptying this buffer,
1417 : * to avoid overflowing the lower level buffer.
1418 : */
4231 heikki.linnakangas 1419 UIC 0 : break;
1420 : }
4231 heikki.linnakangas 1421 EUB :
1422 : /* Free all the memory allocated during index tuple processing */
4209 tgl 1423 GIC 17166 : MemoryContextReset(buildstate->giststate->tempCxt);
1424 : }
4231 heikki.linnakangas 1425 ECB : }
4231 heikki.linnakangas 1426 GIC 17724 : }
1427 :
4231 heikki.linnakangas 1428 ECB : /*
1429 : * Empty all node buffers, from top to bottom. This is done at the end of
1430 : * index build to flush all remaining tuples to the index.
1431 : *
1432 : * Note: This destroys the buffersOnLevels lists, so the buffers should not
1433 : * be inserted to after this call.
1434 : */
1435 : static void
4231 heikki.linnakangas 1436 GIC 3 : gistEmptyAllBuffers(GISTBuildState *buildstate)
1437 : {
4231 heikki.linnakangas 1438 CBC 3 : GISTBuildBuffers *gfbb = buildstate->gfbb;
1439 : MemoryContext oldCtx;
4231 heikki.linnakangas 1440 ECB : int i;
1441 :
4209 tgl 1442 GIC 3 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1443 :
4231 heikki.linnakangas 1444 ECB : /*
1445 : * Iterate through the levels from top to bottom.
1446 : */
4231 heikki.linnakangas 1447 GIC 9 : for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1448 : {
4231 heikki.linnakangas 1449 ECB : /*
1450 : * Empty all buffers on this level. Note that new buffers can pop up
1451 : * in the list during the processing, as a result of page splits, so a
1452 : * simple walk through the list won't work. We remove buffers from the
1453 : * list when we see them empty; a buffer can't become non-empty once
1454 : * it's been fully emptied.
1455 : */
4231 heikki.linnakangas 1456 GIC 24 : while (gfbb->buffersOnLevels[i] != NIL)
1457 : {
4231 heikki.linnakangas 1458 ECB : GISTNodeBuffer *nodeBuffer;
1459 :
4231 heikki.linnakangas 1460 GIC 18 : nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1461 :
4231 heikki.linnakangas 1462 CBC 18 : if (nodeBuffer->blocksCount != 0)
1463 : {
4231 heikki.linnakangas 1464 ECB : /*
1465 : * Add this buffer to the emptying queue, and proceed to empty
1466 : * the queue.
1467 : */
4227 heikki.linnakangas 1468 GIC 9 : if (!nodeBuffer->queuedForEmptying)
1469 : {
4227 heikki.linnakangas 1470 CBC 9 : MemoryContextSwitchTo(gfbb->context);
4227 heikki.linnakangas 1471 GIC 9 : nodeBuffer->queuedForEmptying = true;
4227 heikki.linnakangas 1472 CBC 9 : gfbb->bufferEmptyingQueue =
1473 9 : lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
4209 tgl 1474 9 : MemoryContextSwitchTo(buildstate->giststate->tempCxt);
4227 heikki.linnakangas 1475 ECB : }
4231 heikki.linnakangas 1476 CBC 9 : gistProcessEmptyingQueue(buildstate);
1477 : }
4231 heikki.linnakangas 1478 ECB : else
4231 heikki.linnakangas 1479 GIC 9 : gfbb->buffersOnLevels[i] =
1480 9 : list_delete_first(gfbb->buffersOnLevels[i]);
4231 heikki.linnakangas 1481 ECB : }
3966 heikki.linnakangas 1482 CBC 6 : elog(DEBUG2, "emptied all buffers at level %d", i);
1483 : }
4231 1484 3 : MemoryContextSwitchTo(oldCtx);
4231 heikki.linnakangas 1485 GIC 3 : }
4231 heikki.linnakangas 1486 ECB :
1487 : /*
1488 : * Get the depth of the GiST index.
1489 : */
1490 : static int
4231 heikki.linnakangas 1491 GIC 3 : gistGetMaxLevel(Relation index)
1492 : {
4231 heikki.linnakangas 1493 ECB : int maxLevel;
1494 : BlockNumber blkno;
1495 :
1496 : /*
1497 : * Traverse down the tree, starting from the root, until we hit the leaf
1498 : * level.
1499 : */
4231 heikki.linnakangas 1500 GIC 3 : maxLevel = 0;
1501 3 : blkno = GIST_ROOT_BLKNO;
4231 heikki.linnakangas 1502 ECB : while (true)
4231 heikki.linnakangas 1503 CBC 3 : {
1504 : Buffer buffer;
4231 heikki.linnakangas 1505 ECB : Page page;
1506 : IndexTuple itup;
1507 :
4231 heikki.linnakangas 1508 GIC 6 : buffer = ReadBuffer(index, blkno);
1509 :
4231 heikki.linnakangas 1510 ECB : /*
1511 : * There's no concurrent access during index build, so locking is just
1512 : * pro forma.
1513 : */
4231 heikki.linnakangas 1514 GIC 6 : LockBuffer(buffer, GIST_SHARE);
2545 kgrittn 1515 6 : page = (Page) BufferGetPage(buffer);
4231 heikki.linnakangas 1516 ECB :
4231 heikki.linnakangas 1517 CBC 6 : if (GistPageIsLeaf(page))
1518 : {
4231 heikki.linnakangas 1519 ECB : /* We hit the bottom, so we're done. */
4231 heikki.linnakangas 1520 GIC 3 : UnlockReleaseBuffer(buffer);
1521 3 : break;
4231 heikki.linnakangas 1522 ECB : }
1523 :
1524 : /*
1525 : * Pick the first downlink on the page, and follow it. It doesn't
1526 : * matter which downlink we choose, the tree has the same depth
1527 : * everywhere, so we just pick the first one.
1528 : */
4231 heikki.linnakangas 1529 GIC 3 : itup = (IndexTuple) PageGetItem(page,
1530 : PageGetItemId(page, FirstOffsetNumber));
4231 heikki.linnakangas 1531 CBC 3 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
4231 heikki.linnakangas 1532 GIC 3 : UnlockReleaseBuffer(buffer);
4231 heikki.linnakangas 1533 ECB :
1534 : /*
1535 : * We're going down on the tree. It means that there is yet one more
1536 : * level in the tree.
1537 : */
4231 heikki.linnakangas 1538 GIC 3 : maxLevel++;
1539 : }
4231 heikki.linnakangas 1540 CBC 3 : return maxLevel;
1541 : }
3966 heikki.linnakangas 1542 ECB :
1543 :
1544 : /*
1545 : * Routines for managing the parent map.
1546 : *
1547 : * Whenever a page is split, we need to insert the downlinks into the parent.
1548 : * We need to somehow find the parent page to do that. In normal insertions,
1549 : * we keep a stack of nodes visited when we descend the tree. However, in
1550 : * buffering build, we can start descending the tree from any internal node,
1551 : * when we empty a buffer by cascading tuples to its children. So we don't
1552 : * have a full stack up to the root available at that time.
1553 : *
1554 : * So instead, we maintain a hash table to track the parent of every internal
1555 : * page. We don't need to track the parents of leaf nodes, however. Whenever
1556 : * we insert to a leaf, we've just descended down from its parent, so we know
1557 : * its immediate parent already. This helps a lot to limit the memory used
1558 : * by this hash table.
1559 : *
1560 : * Whenever an internal node is split, the parent map needs to be updated.
1561 : * the parent of the new child page needs to be recorded, and also the
1562 : * entries for all page whose downlinks are moved to a new page at the split
1563 : * needs to be updated.
1564 : *
1565 : * We also update the parent map whenever we descend the tree. That might seem
1566 : * unnecessary, because we maintain the map whenever a downlink is moved or
1567 : * created, but it is needed because we switch to buffering mode after
1568 : * creating a tree with regular index inserts. Any pages created before
1569 : * switching to buffering mode will not be present in the parent map initially,
1570 : * but will be added there the first time we visit them.
1571 : */
1572 :
1573 : typedef struct
1574 : {
1575 : BlockNumber childblkno; /* hash key */
1576 : BlockNumber parentblkno;
1577 : } ParentMapEntry;
1578 :
1579 : static void
3966 heikki.linnakangas 1580 GIC 3 : gistInitParentMap(GISTBuildState *buildstate)
1581 : {
3966 heikki.linnakangas 1582 ECB : HASHCTL hashCtl;
1583 :
3966 heikki.linnakangas 1584 GIC 3 : hashCtl.keysize = sizeof(BlockNumber);
1585 3 : hashCtl.entrysize = sizeof(ParentMapEntry);
3966 heikki.linnakangas 1586 CBC 3 : hashCtl.hcxt = CurrentMemoryContext;
1587 3 : buildstate->parentMap = hash_create("gistbuild parent map",
3966 heikki.linnakangas 1588 ECB : 1024,
1589 : &hashCtl,
1590 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
3966 heikki.linnakangas 1591 GIC 3 : }
1592 :
3966 heikki.linnakangas 1593 ECB : static void
3966 heikki.linnakangas 1594 GIC 17463 : gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1595 : {
3966 heikki.linnakangas 1596 ECB : ParentMapEntry *entry;
1597 : bool found;
1598 :
3966 heikki.linnakangas 1599 GIC 17463 : entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1600 : &child,
3955 bruce 1601 ECB : HASH_ENTER,
1602 : &found);
3966 heikki.linnakangas 1603 GIC 17463 : entry->parentblkno = parent;
1604 17463 : }
3966 heikki.linnakangas 1605 ECB :
1606 : /*
1607 : * Scan all downlinks on a page, and memorize their parent.
1608 : */
1609 : static void
3966 heikki.linnakangas 1610 GIC 6 : gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1611 : {
3966 heikki.linnakangas 1612 ECB : OffsetNumber maxoff;
1613 : OffsetNumber off;
3966 heikki.linnakangas 1614 GIC 6 : BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
2545 kgrittn 1615 6 : Page page = BufferGetPage(parentbuf);
3966 heikki.linnakangas 1616 ECB :
3966 heikki.linnakangas 1617 CBC 6 : Assert(!GistPageIsLeaf(page));
1618 :
1619 6 : maxoff = PageGetMaxOffsetNumber(page);
3966 heikki.linnakangas 1620 GIC 285 : for (off = FirstOffsetNumber; off <= maxoff; off++)
3966 heikki.linnakangas 1621 ECB : {
3955 bruce 1622 CBC 279 : ItemId iid = PageGetItemId(page, off);
3955 bruce 1623 GIC 279 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
3966 heikki.linnakangas 1624 CBC 279 : BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
3955 bruce 1625 ECB :
3966 heikki.linnakangas 1626 CBC 279 : gistMemorizeParent(buildstate, childblkno, parentblkno);
1627 : }
1628 6 : }
1629 :
3966 heikki.linnakangas 1630 ECB : static BlockNumber
3966 heikki.linnakangas 1631 GIC 6 : gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1632 : {
3966 heikki.linnakangas 1633 ECB : ParentMapEntry *entry;
1634 : bool found;
1635 :
1636 : /* Find node buffer in hash table */
3966 heikki.linnakangas 1637 GIC 6 : entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1638 : &child,
3955 bruce 1639 ECB : HASH_FIND,
1640 : &found);
3966 heikki.linnakangas 1641 GIC 6 : if (!found)
722 peter 1642 UIC 0 : elog(ERROR, "could not find parent of block %u in lookup table", child);
3966 heikki.linnakangas 1643 ECB :
3966 heikki.linnakangas 1644 GBC 6 : return entry->parentblkno;
1645 : }
|