Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gindatapage.c
4 : * routines for handling GIN posting tree pages.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/gindatapage.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 : #include "access/ginxlog.h"
19 : #include "access/xloginsert.h"
20 : #include "lib/ilist.h"
21 : #include "miscadmin.h"
22 : #include "storage/predicate.h"
23 : #include "utils/rel.h"
24 :
25 : /*
26 : * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
27 : *
28 : * The code can deal with any size, but random access is more efficient when
29 : * a number of smaller lists are stored, rather than one big list. If a
30 : * posting list would become larger than Max size as a result of insertions,
31 : * it is split into two. If a posting list would be smaller than minimum
32 : * size, it is merged with the next posting list.
33 : */
34 : #define GinPostingListSegmentMaxSize 384
35 : #define GinPostingListSegmentTargetSize 256
36 : #define GinPostingListSegmentMinSize 128
37 :
38 : /*
39 : * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
40 : * long segment. This is used when estimating how much space is required
41 : * for N items, at minimum.
42 : */
43 : #define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
44 :
45 : /*
46 : * A working struct for manipulating a posting tree leaf page.
47 : */
48 : typedef struct
49 : {
50 : dlist_head segments; /* a list of leafSegmentInfos */
51 :
52 : /*
53 : * The following fields represent how the segments are split across pages,
54 : * if a page split is required. Filled in by leafRepackItems.
55 : */
56 : dlist_node *lastleft; /* last segment on left page */
57 : int lsize; /* total size on left page */
58 : int rsize; /* total size on right page */
59 :
60 : bool oldformat; /* page is in pre-9.4 format on disk */
61 :
62 : /*
63 : * If we need WAL data representing the reconstructed leaf page, it's
64 : * stored here by computeLeafRecompressWALData.
65 : */
66 : char *walinfo; /* buffer start */
67 : int walinfolen; /* and length */
68 : } disassembledLeaf;
69 :
70 : typedef struct
71 : {
72 : dlist_node node; /* linked list pointers */
73 :
74 : /*-------------
75 : * 'action' indicates the status of this in-memory segment, compared to
76 : * what's on disk. It is one of the GIN_SEGMENT_* action codes:
77 : *
78 : * UNMODIFIED no changes
79 : * DELETE the segment is to be removed. 'seg' and 'items' are
80 : * ignored
81 : * INSERT this is a completely new segment
82 : * REPLACE this replaces an existing segment with new content
83 : * ADDITEMS like REPLACE, but no items have been removed, and we track
84 : * in detail what items have been added to this segment, in
85 : * 'modifieditems'
86 : *-------------
87 : */
88 : char action;
89 :
90 : ItemPointerData *modifieditems;
91 : uint16 nmodifieditems;
92 :
93 : /*
94 : * The following fields represent the items in this segment. If 'items' is
95 : * not NULL, it contains a palloc'd array of the items in this segment. If
96 : * 'seg' is not NULL, it contains the items in an already-compressed
97 : * format. It can point to an on-disk page (!modified), or a palloc'd
98 : * segment in memory. If both are set, they must represent the same items.
99 : */
100 : GinPostingList *seg;
101 : ItemPointer items;
102 : int nitems; /* # of items in 'items', if items != NULL */
103 : } leafSegmentInfo;
104 :
105 : static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
106 : static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
107 : GinBtreeStack *stack,
108 : void *insertdata, BlockNumber updateblkno,
109 : Page *newlpage, Page *newrpage);
110 :
111 : static disassembledLeaf *disassembleLeaf(Page page);
112 : static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
113 : static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
114 : int nNewItems);
115 :
116 : static void computeLeafRecompressWALData(disassembledLeaf *leaf);
117 : static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
118 : static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
119 : ItemPointerData lbound, ItemPointerData rbound,
120 : Page lpage, Page rpage);
121 :
122 : /*
123 : * Read TIDs from leaf data page to single uncompressed array. The TIDs are
124 : * returned in ascending order.
125 : *
126 : * advancePast is a hint, indicating that the caller is only interested in
127 : * TIDs > advancePast. To return all items, use ItemPointerSetMin.
128 : *
129 : * Note: This function can still return items smaller than advancePast that
130 : * are in the same posting list as the items of interest, so the caller must
131 : * still check all the returned items. But passing it allows this function to
132 : * skip whole posting lists.
133 : */
134 : ItemPointer
3357 heikki.linnakangas 135 CBC 82 : GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast)
136 : {
137 : ItemPointer result;
138 :
3364 139 82 : if (GinPageIsCompressed(page))
140 : {
3357 141 82 : GinPostingList *seg = GinDataLeafPageGetPostingList(page);
3364 142 82 : Size len = GinDataLeafPageGetPostingListSize(page);
3357 143 82 : Pointer endptr = ((Pointer) seg) + len;
144 : GinPostingList *next;
145 :
146 : /* Skip to the segment containing advancePast+1 */
147 82 : if (ItemPointerIsValid(&advancePast))
148 : {
149 47 : next = GinNextPostingListSegment(seg);
150 89 : while ((Pointer) next < endptr &&
151 39 : ginCompareItemPointers(&next->first, &advancePast) <= 0)
152 : {
153 3 : seg = next;
154 3 : next = GinNextPostingListSegment(seg);
155 : }
156 47 : len = endptr - (Pointer) seg;
157 : }
158 :
159 82 : if (len > 0)
160 76 : result = ginPostingListDecodeAllSegments(seg, len, nitems);
161 : else
162 : {
163 6 : result = NULL;
164 6 : *nitems = 0;
165 : }
166 : }
167 : else
168 : {
3364 heikki.linnakangas 169 UBC 0 : ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
170 :
171 0 : result = palloc((*nitems) * sizeof(ItemPointerData));
172 0 : memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
173 : }
174 :
3364 heikki.linnakangas 175 CBC 82 : return result;
176 : }
177 :
178 : /*
179 : * Places all TIDs from leaf data page to bitmap.
180 : */
181 : int
182 15 : GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
183 : {
184 : ItemPointer uncompressed;
185 : int nitems;
186 :
187 15 : if (GinPageIsCompressed(page))
188 : {
189 15 : GinPostingList *segment = GinDataLeafPageGetPostingList(page);
190 15 : Size len = GinDataLeafPageGetPostingListSize(page);
191 :
192 15 : nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
193 : }
194 : else
195 : {
3364 heikki.linnakangas 196 UBC 0 : uncompressed = dataLeafPageGetUncompressed(page, &nitems);
197 :
198 0 : if (nitems > 0)
199 0 : tbm_add_tuples(tbm, uncompressed, nitems, false);
200 : }
201 :
3364 heikki.linnakangas 202 CBC 15 : return nitems;
203 : }
204 :
205 : /*
206 : * Get pointer to the uncompressed array of items on a pre-9.4 format
207 : * uncompressed leaf page. The number of items in the array is returned in
208 : * *nitems.
209 : */
210 : static ItemPointer
3364 heikki.linnakangas 211 UBC 0 : dataLeafPageGetUncompressed(Page page, int *nitems)
212 : {
213 : ItemPointer items;
214 :
215 0 : Assert(!GinPageIsCompressed(page));
216 :
217 : /*
218 : * In the old pre-9.4 page format, the whole page content is used for
219 : * uncompressed items, and the number of items is stored in 'maxoff'
220 : */
221 0 : items = (ItemPointer) GinDataPageGetData(page);
222 0 : *nitems = GinPageGetOpaque(page)->maxoff;
223 :
224 0 : return items;
225 : }
226 :
227 : /*
228 : * Check if we should follow the right link to find the item we're searching
229 : * for.
230 : *
231 : * Compares inserting item pointer with the right bound of the current page.
232 : */
233 : static bool
6031 bruce 234 CBC 13308 : dataIsMoveRight(GinBtree btree, Page page)
235 : {
236 13308 : ItemPointer iptr = GinDataPageGetRightBound(page);
237 :
238 13308 : if (GinPageRightMost(page))
2062 peter_e 239 13308 : return false;
240 :
1237 akorotkov 241 UBC 0 : if (GinPageIsDeleted(page))
242 0 : return true;
243 :
578 michael 244 0 : return (ginCompareItemPointers(&btree->itemptr, iptr) > 0);
245 : }
246 :
247 : /*
248 : * Find correct PostingItem in non-leaf page. It is assumed that this is
249 : * the correct page, and the searched value SHOULD be on the page.
250 : */
251 : static BlockNumber
6031 bruce 252 CBC 13346 : dataLocateItem(GinBtree btree, GinBtreeStack *stack)
253 : {
254 : OffsetNumber low,
255 : high,
256 : maxoff;
257 13346 : PostingItem *pitem = NULL;
258 : int result;
2545 kgrittn 259 13346 : Page page = BufferGetPage(stack->buffer);
260 :
6031 bruce 261 13346 : Assert(!GinPageIsLeaf(page));
262 13346 : Assert(GinPageIsData(page));
263 :
264 13346 : if (btree->fullScan)
265 : {
6186 teodor 266 38 : stack->off = FirstOffsetNumber;
267 38 : stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
3427 heikki.linnakangas 268 38 : return btree->getLeftMostChild(btree, page);
269 : }
270 :
6186 teodor 271 13308 : low = FirstOffsetNumber;
6031 bruce 272 13308 : maxoff = high = GinPageGetOpaque(page)->maxoff;
273 13308 : Assert(high >= low);
274 :
6186 teodor 275 13308 : high++;
276 :
6031 bruce 277 39924 : while (high > low)
278 : {
6186 teodor 279 26616 : OffsetNumber mid = low + ((high - low) / 2);
280 :
3475 heikki.linnakangas 281 26616 : pitem = GinDataPageGetPostingItem(page, mid);
282 :
6031 bruce 283 26616 : if (mid == maxoff)
284 : {
285 : /*
286 : * Right infinity, page already correctly chosen with a help of
287 : * dataIsMoveRight
288 : */
6186 teodor 289 13308 : result = -1;
290 : }
291 : else
292 : {
3475 heikki.linnakangas 293 13308 : pitem = GinDataPageGetPostingItem(page, mid);
3420 294 13308 : result = ginCompareItemPointers(&btree->itemptr, &(pitem->key));
295 : }
296 :
6031 bruce 297 26616 : if (result == 0)
298 : {
6186 teodor 299 UBC 0 : stack->off = mid;
300 0 : return PostingItemGetBlockNumber(pitem);
301 : }
6031 bruce 302 CBC 26616 : else if (result > 0)
6186 teodor 303 13308 : low = mid + 1;
304 : else
305 13308 : high = mid;
306 : }
307 :
6031 bruce 308 13308 : Assert(high >= FirstOffsetNumber && high <= maxoff);
309 :
6186 teodor 310 13308 : stack->off = high;
3475 heikki.linnakangas 311 13308 : pitem = GinDataPageGetPostingItem(page, high);
6186 teodor 312 13308 : return PostingItemGetBlockNumber(pitem);
313 : }
314 :
315 : /*
316 : * Find link to blkno on non-leaf page, returns offset of PostingItem
317 : */
318 : static OffsetNumber
6031 bruce 319 23 : dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
320 : {
321 : OffsetNumber i,
322 23 : maxoff = GinPageGetOpaque(page)->maxoff;
323 : PostingItem *pitem;
324 :
325 23 : Assert(!GinPageIsLeaf(page));
326 23 : Assert(GinPageIsData(page));
327 :
328 : /* if page isn't changed, we return storedOff */
329 23 : if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
330 : {
3475 heikki.linnakangas 331 23 : pitem = GinDataPageGetPostingItem(page, storedOff);
6031 bruce 332 23 : if (PostingItemGetBlockNumber(pitem) == blkno)
6186 teodor 333 23 : return storedOff;
334 :
335 : /*
336 : * we hope, that needed pointer goes to right. It's true if there
337 : * wasn't a deletion
338 : */
6031 bruce 339 UBC 0 : for (i = storedOff + 1; i <= maxoff; i++)
340 : {
3475 heikki.linnakangas 341 0 : pitem = GinDataPageGetPostingItem(page, i);
6031 bruce 342 0 : if (PostingItemGetBlockNumber(pitem) == blkno)
6186 teodor 343 0 : return i;
344 : }
345 :
6031 bruce 346 0 : maxoff = storedOff - 1;
347 : }
348 :
349 : /* last chance */
350 0 : for (i = FirstOffsetNumber; i <= maxoff; i++)
351 : {
3475 heikki.linnakangas 352 0 : pitem = GinDataPageGetPostingItem(page, i);
6031 bruce 353 0 : if (PostingItemGetBlockNumber(pitem) == blkno)
6186 teodor 354 0 : return i;
355 : }
356 :
357 0 : return InvalidOffsetNumber;
358 : }
359 :
360 : /*
361 : * Return blkno of leftmost child
362 : */
363 : static BlockNumber
6031 bruce 364 CBC 38 : dataGetLeftMostPage(GinBtree btree, Page page)
365 : {
366 : PostingItem *pitem;
367 :
368 38 : Assert(!GinPageIsLeaf(page));
369 38 : Assert(GinPageIsData(page));
370 38 : Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
371 :
3475 heikki.linnakangas 372 38 : pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
6186 teodor 373 38 : return PostingItemGetBlockNumber(pitem);
374 : }
375 :
376 : /*
377 : * Add PostingItem to a non-leaf page.
378 : */
379 : void
3475 heikki.linnakangas 380 130 : GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
381 : {
382 130 : OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
383 : char *ptr;
384 :
3420 385 130 : Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
3475 386 130 : Assert(!GinPageIsLeaf(page));
387 :
6031 bruce 388 130 : if (offset == InvalidOffsetNumber)
389 : {
3475 heikki.linnakangas 390 104 : ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1);
391 : }
392 : else
393 : {
394 26 : ptr = (char *) GinDataPageGetPostingItem(page, offset);
3364 395 26 : if (offset != maxoff + 1)
3475 396 26 : memmove(ptr + sizeof(PostingItem),
397 : ptr,
2118 tgl 398 26 : (maxoff - offset + 1) * sizeof(PostingItem));
399 : }
3475 heikki.linnakangas 400 130 : memcpy(ptr, data, sizeof(PostingItem));
401 :
3282 402 130 : maxoff++;
403 130 : GinPageGetOpaque(page)->maxoff = maxoff;
404 :
405 : /*
406 : * Also set pd_lower to the end of the posting items, to follow the
407 : * "standard" page layout, so that we can squeeze out the unused space
408 : * from full-page images.
409 : */
410 130 : GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
6186 teodor 411 130 : }
412 :
413 : /*
414 : * Delete posting item from non-leaf page
415 : */
416 : void
4557 tgl 417 6 : GinPageDeletePostingItem(Page page, OffsetNumber offset)
418 : {
6031 bruce 419 6 : OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
420 :
421 6 : Assert(!GinPageIsLeaf(page));
422 6 : Assert(offset >= FirstOffsetNumber && offset <= maxoff);
423 :
424 6 : if (offset != maxoff)
3475 heikki.linnakangas 425 6 : memmove(GinDataPageGetPostingItem(page, offset),
426 6 : GinDataPageGetPostingItem(page, offset + 1),
6031 bruce 427 6 : sizeof(PostingItem) * (maxoff - offset));
428 :
3282 heikki.linnakangas 429 6 : maxoff--;
430 6 : GinPageGetOpaque(page)->maxoff = maxoff;
431 :
432 6 : GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
6186 teodor 433 6 : }
434 :
435 : /*
436 : * Prepare to insert data on a leaf data page.
437 : *
438 : * If it will fit, return GPTP_INSERT after doing whatever setup is needed
439 : * before we enter the insertion critical section. *ptp_workspace can be
440 : * set to pass information along to the execPlaceToPage function.
441 : *
442 : * If it won't fit, perform a page split and return two temporary page
443 : * images into *newlpage and *newrpage, with result GPTP_SPLIT.
444 : *
445 : * In neither case should the given page buffer be modified here.
446 : */
447 : static GinPlaceToPageRC
2545 tgl 448 24772 : dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
449 : void *insertdata,
450 : void **ptp_workspace,
451 : Page *newlpage, Page *newrpage)
452 : {
3364 heikki.linnakangas 453 24772 : GinBtreeDataLeafInsertData *items = insertdata;
454 24772 : ItemPointer newItems = &items->items[items->curitem];
455 24772 : int maxitems = items->nitem - items->curitem;
2545 kgrittn 456 24772 : Page page = BufferGetPage(buf);
457 : int i;
458 : ItemPointerData rbound;
459 : ItemPointerData lbound;
460 : bool needsplit;
461 : bool append;
462 : int segsize;
463 : Size freespace;
464 : disassembledLeaf *leaf;
465 : leafSegmentInfo *lastleftinfo;
466 : ItemPointerData maxOldItem;
467 : ItemPointerData remaining;
468 :
3260 bruce 469 24772 : rbound = *GinDataPageGetRightBound(page);
470 :
471 : /*
472 : * Count how many of the new items belong to this page.
473 : */
3364 heikki.linnakangas 474 24772 : if (!GinPageRightMost(page))
475 : {
3364 heikki.linnakangas 476 UBC 0 : for (i = 0; i < maxitems; i++)
477 : {
478 0 : if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
479 : {
480 : /*
481 : * This needs to go to some other location in the tree. (The
482 : * caller should've chosen the insert location so that at
483 : * least the first item goes here.)
484 : */
485 0 : Assert(i > 0);
486 0 : break;
487 : }
488 : }
489 0 : maxitems = i;
490 : }
491 :
492 : /* Disassemble the data on the page */
3364 heikki.linnakangas 493 CBC 24772 : leaf = disassembleLeaf(page);
494 :
495 : /*
496 : * Are we appending to the end of the page? IOW, are all the new items
497 : * larger than any of the existing items.
498 : */
499 24772 : if (!dlist_is_empty(&leaf->segments))
500 : {
501 24772 : lastleftinfo = dlist_container(leafSegmentInfo, node,
502 : dlist_tail_node(&leaf->segments));
503 24772 : if (!lastleftinfo->items)
504 24772 : lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
505 : &lastleftinfo->nitems);
506 24772 : maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
507 24772 : if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
508 24772 : append = true;
509 : else
3364 heikki.linnakangas 510 UBC 0 : append = false;
511 : }
512 : else
513 : {
514 0 : ItemPointerSetMin(&maxOldItem);
515 0 : append = true;
516 : }
517 :
518 : /*
519 : * If we're appending to the end of the page, we will append as many items
520 : * as we can fit (after splitting), and stop when the pages becomes full.
521 : * Otherwise we have to limit the number of new items to insert, because
522 : * once we start packing we can't just stop when we run out of space,
523 : * because we must make sure that all the old items still fit.
524 : */
3364 heikki.linnakangas 525 CBC 24772 : if (GinPageIsCompressed(page))
526 24772 : freespace = GinDataLeafPageGetFreeSpace(page);
527 : else
3364 heikki.linnakangas 528 UBC 0 : freespace = 0;
3364 heikki.linnakangas 529 CBC 24772 : if (append)
530 : {
531 : /*
532 : * Even when appending, trying to append more items than will fit is
533 : * not completely free, because we will merge the new items and old
534 : * items into an array below. In the best case, every new item fits in
535 : * a single byte, and we can use all the free space on the old page as
536 : * well as the new page. For simplicity, ignore segment overhead etc.
537 : */
3282 538 24772 : maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize);
539 : }
540 : else
541 : {
542 : /*
543 : * Calculate a conservative estimate of how many new items we can fit
544 : * on the two pages after splitting.
545 : *
546 : * We can use any remaining free space on the old page to store full
547 : * segments, as well as the new page. Each full-sized segment can hold
548 : * at least MinTuplesPerSegment items
549 : */
550 : int nnewsegments;
551 :
3364 heikki.linnakangas 552 UBC 0 : nnewsegments = freespace / GinPostingListSegmentMaxSize;
3282 553 0 : nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize;
3364 554 0 : maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
555 : }
556 :
557 : /* Add the new items to the segment list */
3364 heikki.linnakangas 558 CBC 24772 : if (!addItemsToLeaf(leaf, newItems, maxitems))
559 : {
560 : /* all items were duplicates, we have nothing to do */
3364 heikki.linnakangas 561 UBC 0 : items->curitem += maxitems;
562 :
2545 tgl 563 0 : return GPTP_NO_WORK;
564 : }
565 :
566 : /*
567 : * Pack the items back to compressed segments, ready for writing to disk.
568 : */
3364 heikki.linnakangas 569 CBC 24772 : needsplit = leafRepackItems(leaf, &remaining);
570 :
571 : /*
572 : * Did all the new items fit?
573 : *
574 : * If we're appending, it's OK if they didn't. But as a sanity check,
575 : * verify that all the old items fit.
576 : */
577 24772 : if (ItemPointerIsValid(&remaining))
578 : {
579 20 : if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
3364 heikki.linnakangas 580 UBC 0 : elog(ERROR, "could not split GIN page; all old items didn't fit");
581 :
582 : /* Count how many of the new items did fit. */
3364 heikki.linnakangas 583 CBC 152421 : for (i = 0; i < maxitems; i++)
584 : {
585 152421 : if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
586 20 : break;
587 : }
588 20 : if (i == 0)
3364 heikki.linnakangas 589 UBC 0 : elog(ERROR, "could not split GIN page; no new items fit");
3364 heikki.linnakangas 590 CBC 20 : maxitems = i;
591 : }
592 :
593 24772 : if (!needsplit)
594 : {
595 : /*
596 : * Great, all the items fit on a single page. If needed, prepare data
597 : * for a WAL record describing the changes we'll make.
598 : */
1467 599 24697 : if (RelationNeedsWAL(btree->index) && !btree->isBuild)
2545 tgl 600 24697 : computeLeafRecompressWALData(leaf);
601 :
602 : /*
603 : * We're ready to enter the critical section, but
604 : * dataExecPlaceToPageLeaf will need access to the "leaf" data.
605 : */
606 24697 : *ptp_workspace = leaf;
607 :
3364 heikki.linnakangas 608 24697 : if (append)
609 24697 : elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
610 : maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
611 : items->nitem - items->curitem - maxitems);
612 : else
3364 heikki.linnakangas 613 UBC 0 : elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
614 : maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
615 : items->nitem - items->curitem - maxitems);
616 : }
617 : else
618 : {
619 : /*
620 : * Have to split.
621 : *
622 : * leafRepackItems already divided the segments between the left and
623 : * the right page. It filled the left page as full as possible, and
624 : * put the rest to the right page. When building a new index, that's
625 : * good, because the table is scanned from beginning to end and there
626 : * won't be any more insertions to the left page during the build.
627 : * This packs the index as tight as possible. But otherwise, split
628 : * 50/50, by moving segments from the left page to the right page
629 : * until they're balanced.
630 : *
631 : * As a further heuristic, when appending items to the end of the
632 : * page, try to make the left page 75% full, on the assumption that
633 : * subsequent insertions will probably also go to the end. This packs
634 : * the index somewhat tighter when appending to a table, which is very
635 : * common.
636 : */
3364 heikki.linnakangas 637 CBC 75 : if (!btree->isBuild)
638 : {
639 130 : while (dlist_has_prev(&leaf->segments, leaf->lastleft))
640 : {
641 130 : lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
642 :
643 : /* ignore deleted segments */
3145 644 130 : if (lastleftinfo->action != GIN_SEGMENT_DELETE)
645 : {
646 130 : segsize = SizeOfGinPostingList(lastleftinfo->seg);
647 :
648 : /*
649 : * Note that we check that the right page doesn't become
650 : * more full than the left page even when appending. It's
651 : * possible that we added enough items to make both pages
652 : * more than 75% full.
653 : */
3131 654 130 : if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
655 24 : break;
3145 656 106 : if (append)
657 : {
3131 658 106 : if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4)
3145 659 5 : break;
660 : }
661 :
662 101 : leaf->lsize -= segsize;
663 101 : leaf->rsize += segsize;
664 : }
3364 665 101 : leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
666 : }
667 : }
3282 668 75 : Assert(leaf->lsize <= GinDataPageMaxDataSize);
669 75 : Assert(leaf->rsize <= GinDataPageMaxDataSize);
670 :
671 : /*
672 : * Fetch the max item in the left page's last segment; it becomes the
673 : * right bound of the page.
674 : */
3364 675 75 : lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
676 75 : if (!lastleftinfo->items)
677 75 : lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
678 : &lastleftinfo->nitems);
679 75 : lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
680 :
681 : /*
682 : * Now allocate a couple of temporary page images, and fill them.
683 : */
2545 tgl 684 75 : *newlpage = palloc(BLCKSZ);
685 75 : *newrpage = palloc(BLCKSZ);
686 :
687 75 : dataPlaceToPageLeafSplit(leaf, lbound, rbound,
688 : *newlpage, *newrpage);
689 :
3364 heikki.linnakangas 690 75 : Assert(GinPageRightMost(page) ||
691 : ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
692 : GinDataPageGetRightBound(*newrpage)) < 0);
693 :
694 75 : if (append)
695 75 : elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
696 : maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
697 : items->nitem - items->curitem - maxitems);
698 : else
3364 heikki.linnakangas 699 UBC 0 : elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
700 : maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
701 : items->nitem - items->curitem - maxitems);
702 : }
703 :
3364 heikki.linnakangas 704 CBC 24772 : items->curitem += maxitems;
705 :
2545 tgl 706 24772 : return needsplit ? GPTP_SPLIT : GPTP_INSERT;
707 : }
708 :
709 : /*
710 : * Perform data insertion after beginPlaceToPage has decided it will fit.
711 : *
712 : * This is invoked within a critical section, and XLOG record creation (if
713 : * needed) is already started. The target buffer is registered in slot 0.
714 : */
715 : static void
716 24697 : dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
717 : void *insertdata, void *ptp_workspace)
718 : {
719 24697 : disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace;
720 :
721 : /* Apply changes to page */
722 24697 : dataPlaceToPageLeafRecompress(buf, leaf);
723 :
724 : /* If needed, register WAL data built by computeLeafRecompressWALData */
1467 heikki.linnakangas 725 24697 : if (RelationNeedsWAL(btree->index) && !btree->isBuild)
726 : {
2545 tgl 727 24697 : XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen);
728 : }
3364 heikki.linnakangas 729 24697 : }
730 :
731 : /*
732 : * Vacuum a posting tree leaf page.
733 : */
734 : void
735 21 : ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
736 : {
2545 kgrittn 737 21 : Page page = BufferGetPage(buffer);
738 : disassembledLeaf *leaf;
3364 heikki.linnakangas 739 21 : bool removedsomething = false;
740 : dlist_iter iter;
741 :
742 21 : leaf = disassembleLeaf(page);
743 :
744 : /* Vacuum each segment. */
745 471 : dlist_foreach(iter, &leaf->segments)
746 : {
747 450 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
748 : int oldsegsize;
749 : ItemPointer cleaned;
750 : int ncleaned;
751 :
752 450 : if (!seginfo->items)
753 450 : seginfo->items = ginPostingListDecode(seginfo->seg,
754 : &seginfo->nitems);
755 450 : if (seginfo->seg)
756 450 : oldsegsize = SizeOfGinPostingList(seginfo->seg);
757 : else
3282 heikki.linnakangas 758 UBC 0 : oldsegsize = GinDataPageMaxDataSize;
759 :
3364 heikki.linnakangas 760 CBC 450 : cleaned = ginVacuumItemPointers(gvs,
761 : seginfo->items,
762 : seginfo->nitems,
763 : &ncleaned);
764 450 : pfree(seginfo->items);
765 450 : seginfo->items = NULL;
766 450 : seginfo->nitems = 0;
767 450 : if (cleaned)
768 : {
769 441 : if (ncleaned > 0)
770 : {
771 : int npacked;
772 :
773 9 : seginfo->seg = ginCompressPostingList(cleaned,
774 : ncleaned,
775 : oldsegsize,
776 : &npacked);
777 : /* Removing an item never increases the size of the segment */
778 9 : if (npacked != ncleaned)
3364 heikki.linnakangas 779 UBC 0 : elog(ERROR, "could not fit vacuumed posting list");
3296 heikki.linnakangas 780 CBC 9 : seginfo->action = GIN_SEGMENT_REPLACE;
781 : }
782 : else
783 : {
3364 784 432 : seginfo->seg = NULL;
785 432 : seginfo->items = NULL;
3296 786 432 : seginfo->action = GIN_SEGMENT_DELETE;
787 : }
3364 788 441 : seginfo->nitems = ncleaned;
789 :
790 441 : removedsomething = true;
791 : }
792 : }
793 :
794 : /*
795 : * If we removed any items, reconstruct the page from the pieces.
796 : *
797 : * We don't try to re-encode the segments here, even though some of them
798 : * might be really small now that we've removed some items from them. It
799 : * seems like a waste of effort, as there isn't really any benefit from
800 : * larger segments per se; larger segments only help to pack more items in
801 : * the same space. We might as well delay doing that until the next
802 : * insertion, which will need to re-encode at least part of the page
803 : * anyway.
804 : *
805 : * Also note if the page was in uncompressed, pre-9.4 format before, it is
806 : * now represented as one huge segment that contains all the items. It
807 : * might make sense to split that, to speed up random access, but we don't
808 : * bother. You'll have to REINDEX anyway if you want the full gain of the
809 : * new tighter index format.
810 : */
811 21 : if (removedsomething)
812 : {
813 : bool modified;
814 :
815 : /*
816 : * Make sure we have a palloc'd copy of all segments, after the first
817 : * segment that is modified. (dataPlaceToPageLeafRecompress requires
818 : * this).
819 : */
3296 820 21 : modified = false;
821 471 : dlist_foreach(iter, &leaf->segments)
822 : {
823 450 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
824 : iter.cur);
825 :
826 450 : if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
827 441 : modified = true;
828 450 : if (modified && seginfo->action != GIN_SEGMENT_DELETE)
829 : {
830 18 : int segsize = SizeOfGinPostingList(seginfo->seg);
831 18 : GinPostingList *tmp = (GinPostingList *) palloc(segsize);
832 :
833 18 : memcpy(tmp, seginfo->seg, segsize);
834 18 : seginfo->seg = tmp;
835 : }
836 : }
837 :
838 21 : if (RelationNeedsWAL(indexrel))
2545 tgl 839 3 : computeLeafRecompressWALData(leaf);
840 :
841 : /* Apply changes to page */
3364 heikki.linnakangas 842 21 : START_CRIT_SECTION();
843 :
3296 844 21 : dataPlaceToPageLeafRecompress(buffer, leaf);
845 :
3364 846 21 : MarkBufferDirty(buffer);
847 :
848 21 : if (RelationNeedsWAL(indexrel))
849 : {
850 : XLogRecPtr recptr;
851 :
2545 tgl 852 3 : XLogBeginInsert();
853 3 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
854 3 : XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen);
3062 heikki.linnakangas 855 3 : recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE);
3364 856 3 : PageSetLSN(page, recptr);
857 : }
858 :
859 21 : END_CRIT_SECTION();
860 : }
861 21 : }
862 :
863 : /*
864 : * Construct a ginxlogRecompressDataLeaf record representing the changes
865 : * in *leaf. (Because this requires a palloc, we have to do it before
866 : * we enter the critical section that actually updates the page.)
867 : */
868 : static void
2545 tgl 869 24700 : computeLeafRecompressWALData(disassembledLeaf *leaf)
870 : {
3296 heikki.linnakangas 871 24700 : int nmodified = 0;
872 : char *walbufbegin;
873 : char *walbufend;
874 : dlist_iter iter;
875 : int segno;
876 : ginxlogRecompressDataLeaf *recompress_xlog;
877 :
878 : /* Count the modified segments */
879 445907 : dlist_foreach(iter, &leaf->segments)
880 : {
881 421207 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
882 : iter.cur);
883 :
884 421207 : if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
885 24733 : nmodified++;
886 : }
887 :
888 : walbufbegin =
3062 889 24700 : palloc(sizeof(ginxlogRecompressDataLeaf) +
890 : BLCKSZ + /* max size needed to hold the segment data */
891 24700 : nmodified * 2 /* (segno + action) per action */
892 : );
3296 893 24700 : walbufend = walbufbegin;
894 :
895 24700 : recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
896 24700 : walbufend += sizeof(ginxlogRecompressDataLeaf);
897 :
898 24700 : recompress_xlog->nactions = nmodified;
899 :
900 24700 : segno = 0;
901 445907 : dlist_foreach(iter, &leaf->segments)
902 : {
903 421207 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
904 : iter.cur);
905 421207 : int segsize = 0;
906 : int datalen;
907 421207 : uint8 action = seginfo->action;
908 :
909 421207 : if (action == GIN_SEGMENT_UNMODIFIED)
910 : {
911 396474 : segno++;
912 396474 : continue;
913 : }
914 :
915 24733 : if (action != GIN_SEGMENT_DELETE)
916 24721 : segsize = SizeOfGinPostingList(seginfo->seg);
917 :
918 : /*
919 : * If storing the uncompressed list of added item pointers would take
920 : * more space than storing the compressed segment as is, do that
921 : * instead.
922 : */
923 24733 : if (action == GIN_SEGMENT_ADDITEMS &&
924 24580 : seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
925 : {
3296 heikki.linnakangas 926 UBC 0 : action = GIN_SEGMENT_REPLACE;
927 : }
928 :
3296 heikki.linnakangas 929 CBC 24733 : *((uint8 *) (walbufend++)) = segno;
930 24733 : *(walbufend++) = action;
931 :
932 24733 : switch (action)
933 : {
934 12 : case GIN_SEGMENT_DELETE:
935 12 : datalen = 0;
936 12 : break;
937 :
938 24580 : case GIN_SEGMENT_ADDITEMS:
939 24580 : datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
940 24580 : memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
941 24580 : memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
942 24580 : datalen += sizeof(uint16);
943 24580 : break;
944 :
945 141 : case GIN_SEGMENT_INSERT:
946 : case GIN_SEGMENT_REPLACE:
947 141 : datalen = SHORTALIGN(segsize);
948 141 : memcpy(walbufend, seginfo->seg, segsize);
949 141 : break;
950 :
3296 heikki.linnakangas 951 UBC 0 : default:
952 0 : elog(ERROR, "unexpected GIN leaf action %d", action);
953 : }
3296 heikki.linnakangas 954 CBC 24733 : walbufend += datalen;
955 :
956 24733 : if (action != GIN_SEGMENT_INSERT)
957 24601 : segno++;
958 : }
959 :
960 : /* Pass back the constructed info via *leaf */
2545 tgl 961 24700 : leaf->walinfo = walbufbegin;
962 24700 : leaf->walinfolen = walbufend - walbufbegin;
3296 heikki.linnakangas 963 24700 : }
964 :
965 : /*
966 : * Assemble a disassembled posting tree leaf page back to a buffer.
967 : *
968 : * This just updates the target buffer; WAL stuff is caller's responsibility.
969 : *
970 : * NOTE: The segment pointers must not point directly to the same buffer,
971 : * except for segments that have not been modified and whose preceding
972 : * segments have not been modified either.
973 : */
974 : static void
975 24718 : dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
976 : {
2545 kgrittn 977 24718 : Page page = BufferGetPage(buf);
978 : char *ptr;
979 : int newsize;
3364 heikki.linnakangas 980 24718 : bool modified = false;
981 : dlist_iter iter;
982 : int segsize;
983 :
984 : /*
985 : * If the page was in pre-9.4 format before, convert the header, and force
986 : * all segments to be copied to the page whether they were modified or
987 : * not.
988 : */
3296 989 24718 : if (!GinPageIsCompressed(page))
990 : {
3296 heikki.linnakangas 991 UBC 0 : Assert(leaf->oldformat);
992 0 : GinPageSetCompressed(page);
993 0 : GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
994 0 : modified = true;
995 : }
996 :
3364 heikki.linnakangas 997 CBC 24718 : ptr = (char *) GinDataLeafPageGetPostingList(page);
998 24718 : newsize = 0;
999 446348 : dlist_foreach(iter, &leaf->segments)
1000 : {
1001 421630 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
1002 :
3296 1003 421630 : if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
3364 1004 25156 : modified = true;
1005 :
3296 1006 421630 : if (seginfo->action != GIN_SEGMENT_DELETE)
1007 : {
1008 421198 : segsize = SizeOfGinPostingList(seginfo->seg);
1009 :
1010 421198 : if (modified)
1011 24733 : memcpy(ptr, seginfo->seg, segsize);
1012 :
1013 421198 : ptr += segsize;
1014 421198 : newsize += segsize;
1015 : }
1016 : }
1017 :
3282 1018 24718 : Assert(newsize <= GinDataPageMaxDataSize);
1019 24718 : GinDataPageSetDataSize(page, newsize);
6186 teodor 1020 24718 : }
1021 :
1022 : /*
1023 : * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
1024 : * segments to two pages instead of one.
1025 : *
1026 : * This is different from the non-split cases in that this does not modify
1027 : * the original page directly, but writes to temporary in-memory copies of
1028 : * the new left and right pages.
1029 : */
1030 : static void
2545 tgl 1031 75 : dataPlaceToPageLeafSplit(disassembledLeaf *leaf,
1032 : ItemPointerData lbound, ItemPointerData rbound,
1033 : Page lpage, Page rpage)
1034 : {
1035 : char *ptr;
1036 : int segsize;
1037 : int lsize;
1038 : int rsize;
1039 : dlist_node *node;
1040 : dlist_node *firstright;
1041 : leafSegmentInfo *seginfo;
1042 :
1043 : /* Initialize temporary pages to hold the new left and right pages */
3364 heikki.linnakangas 1044 75 : GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1045 75 : GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1046 :
1047 : /*
1048 : * Copy the segments that go to the left page.
1049 : *
1050 : * XXX: We should skip copying the unmodified part of the left page, like
1051 : * we do when recompressing.
1052 : */
1053 75 : lsize = 0;
1054 75 : ptr = (char *) GinDataLeafPageGetPostingList(lpage);
1055 75 : firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
1056 75 : for (node = dlist_head_node(&leaf->segments);
1057 1793 : node != firstright;
1058 1718 : node = dlist_next_node(&leaf->segments, node))
1059 : {
1060 1718 : seginfo = dlist_container(leafSegmentInfo, node, node);
1061 :
3296 1062 1718 : if (seginfo->action != GIN_SEGMENT_DELETE)
1063 : {
1064 1718 : segsize = SizeOfGinPostingList(seginfo->seg);
1065 1718 : memcpy(ptr, seginfo->seg, segsize);
1066 1718 : ptr += segsize;
1067 1718 : lsize += segsize;
1068 : }
1069 : }
3364 1070 75 : Assert(lsize == leaf->lsize);
3282 1071 75 : GinDataPageSetDataSize(lpage, lsize);
3364 1072 75 : *GinDataPageGetRightBound(lpage) = lbound;
1073 :
1074 : /* Copy the segments that go to the right page */
1075 75 : ptr = (char *) GinDataLeafPageGetPostingList(rpage);
1076 75 : rsize = 0;
1077 75 : for (node = firstright;
1078 : ;
1079 972 : node = dlist_next_node(&leaf->segments, node))
1080 : {
1081 1047 : seginfo = dlist_container(leafSegmentInfo, node, node);
1082 :
3296 1083 1047 : if (seginfo->action != GIN_SEGMENT_DELETE)
1084 : {
1085 1047 : segsize = SizeOfGinPostingList(seginfo->seg);
1086 1047 : memcpy(ptr, seginfo->seg, segsize);
1087 1047 : ptr += segsize;
1088 1047 : rsize += segsize;
1089 : }
1090 :
3364 1091 1047 : if (!dlist_has_next(&leaf->segments, node))
1092 75 : break;
1093 : }
1094 75 : Assert(rsize == leaf->rsize);
3282 1095 75 : GinDataPageSetDataSize(rpage, rsize);
3364 1096 75 : *GinDataPageGetRightBound(rpage) = rbound;
1097 75 : }
1098 :
1099 : /*
1100 : * Prepare to insert data on an internal data page.
1101 : *
1102 : * If it will fit, return GPTP_INSERT after doing whatever setup is needed
1103 : * before we enter the insertion critical section. *ptp_workspace can be
1104 : * set to pass information along to the execPlaceToPage function.
1105 : *
1106 : * If it won't fit, perform a page split and return two temporary page
1107 : * images into *newlpage and *newrpage, with result GPTP_SPLIT.
1108 : *
1109 : * In neither case should the given page buffer be modified here.
1110 : *
1111 : * Note: on insertion to an internal node, in addition to inserting the given
1112 : * item, the downlink of the existing item at stack->off will be updated to
1113 : * point to updateblkno.
1114 : */
1115 : static GinPlaceToPageRC
2545 tgl 1116 23 : dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1117 : void *insertdata, BlockNumber updateblkno,
1118 : void **ptp_workspace,
1119 : Page *newlpage, Page *newrpage)
1120 : {
kgrittn 1121 23 : Page page = BufferGetPage(buf);
1122 :
1123 : /* If it doesn't fit, deal with split case */
3364 heikki.linnakangas 1124 23 : if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
1125 : {
3364 heikki.linnakangas 1126 UBC 0 : dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
1127 : newlpage, newrpage);
2545 tgl 1128 0 : return GPTP_SPLIT;
1129 : }
1130 :
1131 : /* Else, we're ready to proceed with insertion */
2545 tgl 1132 CBC 23 : return GPTP_INSERT;
1133 : }
1134 :
1135 : /*
1136 : * Perform data insertion after beginPlaceToPage has decided it will fit.
1137 : *
1138 : * This is invoked within a critical section, and XLOG record creation (if
1139 : * needed) is already started. The target buffer is registered in slot 0.
1140 : */
1141 : static void
1142 23 : dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1143 : void *insertdata, BlockNumber updateblkno,
1144 : void *ptp_workspace)
1145 : {
1146 23 : Page page = BufferGetPage(buf);
1147 23 : OffsetNumber off = stack->off;
1148 : PostingItem *pitem;
1149 :
1150 : /* Update existing downlink to point to next page (on internal page) */
3364 heikki.linnakangas 1151 23 : pitem = GinDataPageGetPostingItem(page, off);
1152 23 : PostingItemSetBlockNumber(pitem, updateblkno);
1153 :
1154 : /* Add new item */
1155 23 : pitem = (PostingItem *) insertdata;
1156 23 : GinDataPageAddPostingItem(page, pitem, off);
1157 :
1467 1158 23 : if (RelationNeedsWAL(btree->index) && !btree->isBuild)
1159 : {
1160 : /*
1161 : * This must be static, because it has to survive until XLogInsert,
1162 : * and we can't palloc here. Ugly, but the XLogInsert infrastructure
1163 : * isn't reentrant anyway.
1164 : */
1165 : static ginxlogInsertDataInternal data;
1166 :
3062 1167 9 : data.offset = off;
1168 9 : data.newitem = *pitem;
1169 :
1170 9 : XLogRegisterBufData(0, (char *) &data,
1171 : sizeof(ginxlogInsertDataInternal));
1172 : }
3364 1173 23 : }
1174 :
1175 : /*
1176 : * Prepare to insert data on a posting-tree data page.
1177 : *
1178 : * If it will fit, return GPTP_INSERT after doing whatever setup is needed
1179 : * before we enter the insertion critical section. *ptp_workspace can be
1180 : * set to pass information along to the execPlaceToPage function.
1181 : *
1182 : * If it won't fit, perform a page split and return two temporary page
1183 : * images into *newlpage and *newrpage, with result GPTP_SPLIT.
1184 : *
1185 : * In neither case should the given page buffer be modified here.
1186 : *
1187 : * Note: on insertion to an internal node, in addition to inserting the given
1188 : * item, the downlink of the existing item at stack->off will be updated to
1189 : * point to updateblkno.
1190 : *
1191 : * Calls relevant function for internal or leaf page because they are handled
1192 : * very differently.
1193 : */
1194 : static GinPlaceToPageRC
2545 tgl 1195 24795 : dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1196 : void *insertdata, BlockNumber updateblkno,
1197 : void **ptp_workspace,
1198 : Page *newlpage, Page *newrpage)
1199 : {
kgrittn 1200 24795 : Page page = BufferGetPage(buf);
1201 :
3364 heikki.linnakangas 1202 24795 : Assert(GinPageIsData(page));
1203 :
1204 24795 : if (GinPageIsLeaf(page))
2545 tgl 1205 24772 : return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata,
1206 : ptp_workspace,
1207 : newlpage, newrpage);
1208 : else
1209 23 : return dataBeginPlaceToPageInternal(btree, buf, stack,
1210 : insertdata, updateblkno,
1211 : ptp_workspace,
1212 : newlpage, newrpage);
1213 : }
1214 :
1215 : /*
1216 : * Perform data insertion after beginPlaceToPage has decided it will fit.
1217 : *
1218 : * This is invoked within a critical section, and XLOG record creation (if
1219 : * needed) is already started. The target buffer is registered in slot 0.
1220 : *
1221 : * Calls relevant function for internal or leaf page because they are handled
1222 : * very differently.
1223 : */
1224 : static void
1225 24720 : dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1226 : void *insertdata, BlockNumber updateblkno,
1227 : void *ptp_workspace)
1228 : {
1229 24720 : Page page = BufferGetPage(buf);
1230 :
1231 24720 : if (GinPageIsLeaf(page))
1232 24697 : dataExecPlaceToPageLeaf(btree, buf, stack, insertdata,
1233 : ptp_workspace);
1234 : else
1235 23 : dataExecPlaceToPageInternal(btree, buf, stack, insertdata,
1236 : updateblkno, ptp_workspace);
3364 heikki.linnakangas 1237 24720 : }
1238 :
1239 : /*
1240 : * Split internal page and insert new data.
1241 : *
1242 : * Returns new temp pages to *newlpage and *newrpage.
1243 : * The original buffer is left untouched.
1244 : */
1245 : static void
3364 heikki.linnakangas 1246 UBC 0 : dataSplitPageInternal(GinBtree btree, Buffer origbuf,
1247 : GinBtreeStack *stack,
1248 : void *insertdata, BlockNumber updateblkno,
1249 : Page *newlpage, Page *newrpage)
1250 : {
2545 kgrittn 1251 0 : Page oldpage = BufferGetPage(origbuf);
3364 heikki.linnakangas 1252 0 : OffsetNumber off = stack->off;
1253 0 : int nitems = GinPageGetOpaque(oldpage)->maxoff;
1254 : int nleftitems;
1255 : int nrightitems;
1256 0 : Size pageSize = PageGetPageSize(oldpage);
1257 0 : ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
1258 : ItemPointer bound;
1259 : Page lpage;
1260 : Page rpage;
1261 : OffsetNumber separator;
1262 : PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
1263 :
1264 0 : lpage = PageGetTempPage(oldpage);
1265 0 : rpage = PageGetTempPage(oldpage);
1266 0 : GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
1267 0 : GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
1268 :
1269 : /*
1270 : * First construct a new list of PostingItems, which includes all the old
1271 : * items, and the new item.
1272 : */
1273 0 : memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
1274 0 : (off - 1) * sizeof(PostingItem));
1275 :
1276 0 : allitems[off - 1] = *((PostingItem *) insertdata);
1277 0 : memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
1278 0 : (nitems - (off - 1)) * sizeof(PostingItem));
1279 0 : nitems++;
1280 :
1281 : /* Update existing downlink to point to next page */
1282 0 : PostingItemSetBlockNumber(&allitems[off], updateblkno);
1283 :
1284 : /*
1285 : * When creating a new index, fit as many tuples as possible on the left
1286 : * page, on the assumption that the table is scanned from beginning to
1287 : * end. This packs the index as tight as possible.
1288 : */
1289 0 : if (btree->isBuild && GinPageRightMost(oldpage))
1290 0 : separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
1291 : else
1292 0 : separator = nitems / 2;
3282 1293 0 : nleftitems = separator;
1294 0 : nrightitems = nitems - separator;
1295 :
1296 0 : memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
1297 : allitems,
1298 : nleftitems * sizeof(PostingItem));
1299 0 : GinPageGetOpaque(lpage)->maxoff = nleftitems;
3364 1300 0 : memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
3282 1301 0 : &allitems[separator],
1302 : nrightitems * sizeof(PostingItem));
1303 0 : GinPageGetOpaque(rpage)->maxoff = nrightitems;
1304 :
1305 : /*
1306 : * Also set pd_lower for both pages, like GinDataPageAddPostingItem does.
1307 : */
1308 0 : GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem));
1309 0 : GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem));
1310 :
1311 : /* set up right bound for left page */
6186 teodor 1312 0 : bound = GinDataPageGetRightBound(lpage);
3282 heikki.linnakangas 1313 0 : *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key;
1314 :
1315 : /* set up right bound for right page */
3364 1316 0 : *GinDataPageGetRightBound(rpage) = oldbound;
1317 :
1318 : /* return temp pages to caller */
1319 0 : *newlpage = lpage;
1320 0 : *newrpage = rpage;
3427 1321 0 : }
1322 :
1323 : /*
1324 : * Construct insertion payload for inserting the downlink for given buffer.
1325 : */
1326 : static void *
3427 heikki.linnakangas 1327 CBC 23 : dataPrepareDownlink(GinBtree btree, Buffer lbuf)
1328 : {
3420 1329 23 : PostingItem *pitem = palloc(sizeof(PostingItem));
2545 kgrittn 1330 23 : Page lpage = BufferGetPage(lbuf);
1331 :
3420 heikki.linnakangas 1332 23 : PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf));
1333 23 : pitem->key = *GinDataPageGetRightBound(lpage);
1334 :
1335 23 : return pitem;
1336 : }
1337 :
1338 : /*
1339 : * Fills new root by right bound values from child.
1340 : * Also called from ginxlog, should not use btree
1341 : */
1342 : void
1343 52 : ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
1344 : {
1345 : PostingItem li,
1346 : ri;
1347 :
6186 teodor 1348 52 : li.key = *GinDataPageGetRightBound(lpage);
3420 heikki.linnakangas 1349 52 : PostingItemSetBlockNumber(&li, lblkno);
1350 52 : GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
1351 :
6186 teodor 1352 52 : ri.key = *GinDataPageGetRightBound(rpage);
3420 heikki.linnakangas 1353 52 : PostingItemSetBlockNumber(&ri, rblkno);
1354 52 : GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
6186 teodor 1355 52 : }
1356 :
1357 :
1358 : /*** Functions to work with disassembled leaf pages ***/
1359 :
1360 : /*
1361 : * Disassemble page into a disassembledLeaf struct.
1362 : */
1363 : static disassembledLeaf *
3364 heikki.linnakangas 1364 24793 : disassembleLeaf(Page page)
1365 : {
1366 : disassembledLeaf *leaf;
1367 : GinPostingList *seg;
1368 : Pointer segbegin;
1369 : Pointer segend;
1370 :
1371 24793 : leaf = palloc0(sizeof(disassembledLeaf));
1372 24793 : dlist_init(&leaf->segments);
1373 :
1374 24793 : if (GinPageIsCompressed(page))
1375 : {
1376 : /*
1377 : * Create a leafSegmentInfo entry for each segment.
1378 : */
1379 24793 : seg = GinDataLeafPageGetPostingList(page);
1380 24793 : segbegin = (Pointer) seg;
1381 24793 : segend = segbegin + GinDataLeafPageGetPostingListSize(page);
1382 448115 : while ((Pointer) seg < segend)
1383 : {
1384 423322 : leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
1385 :
3296 1386 423322 : seginfo->action = GIN_SEGMENT_UNMODIFIED;
3364 1387 423322 : seginfo->seg = seg;
1388 423322 : seginfo->items = NULL;
1389 423322 : seginfo->nitems = 0;
1390 423322 : dlist_push_tail(&leaf->segments, &seginfo->node);
1391 :
1392 423322 : seg = GinNextPostingListSegment(seg);
1393 : }
3296 1394 24793 : leaf->oldformat = false;
1395 : }
1396 : else
1397 : {
1398 : /*
1399 : * A pre-9.4 format uncompressed page is represented by a single
1400 : * segment, with an array of items. The corner case is uncompressed
1401 : * page containing no items, which is represented as no segments.
1402 : */
1403 : ItemPointer uncompressed;
1404 : int nuncompressed;
1405 : leafSegmentInfo *seginfo;
1406 :
3364 heikki.linnakangas 1407 UBC 0 : uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
1408 :
1725 akorotkov 1409 0 : if (nuncompressed > 0)
1410 : {
1411 0 : seginfo = palloc(sizeof(leafSegmentInfo));
1412 :
1413 0 : seginfo->action = GIN_SEGMENT_REPLACE;
1414 0 : seginfo->seg = NULL;
1415 0 : seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
1416 0 : memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
1417 0 : seginfo->nitems = nuncompressed;
1418 :
1419 0 : dlist_push_tail(&leaf->segments, &seginfo->node);
1420 : }
1421 :
3296 heikki.linnakangas 1422 0 : leaf->oldformat = true;
1423 : }
1424 :
3364 heikki.linnakangas 1425 CBC 24793 : return leaf;
1426 : }
1427 :
1428 : /*
1429 : * Distribute newItems to the segments.
1430 : *
1431 : * Any segments that acquire new items are decoded, and the new items are
1432 : * merged with the old items.
1433 : *
1434 : * Returns true if any new items were added. False means they were all
1435 : * duplicates of existing items on the page.
1436 : */
1437 : static bool
1438 24772 : addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
1439 : {
1440 : dlist_iter iter;
1441 24772 : ItemPointer nextnew = newItems;
1442 24772 : int newleft = nNewItems;
1443 24772 : bool modified = false;
1444 : leafSegmentInfo *newseg;
1445 :
1446 : /*
1447 : * If the page is completely empty, just construct one new segment to hold
1448 : * all the new items.
1449 : */
1450 24772 : if (dlist_is_empty(&leaf->segments))
1451 : {
3296 heikki.linnakangas 1452 UBC 0 : newseg = palloc(sizeof(leafSegmentInfo));
1453 0 : newseg->seg = NULL;
1454 0 : newseg->items = newItems;
1455 0 : newseg->nitems = nNewItems;
1456 0 : newseg->action = GIN_SEGMENT_INSERT;
1457 0 : dlist_push_tail(&leaf->segments, &newseg->node);
3364 1458 0 : return true;
1459 : }
1460 :
3364 heikki.linnakangas 1461 CBC 422872 : dlist_foreach(iter, &leaf->segments)
1462 : {
3260 bruce 1463 422872 : leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur);
1464 : int nthis;
1465 : ItemPointer tmpitems;
1466 : int ntmpitems;
1467 :
1468 : /*
1469 : * How many of the new items fall into this segment?
1470 : */
3364 heikki.linnakangas 1471 422872 : if (!dlist_has_next(&leaf->segments, iter.cur))
1472 24772 : nthis = newleft;
1473 : else
1474 : {
1475 : leafSegmentInfo *next;
1476 : ItemPointerData next_first;
1477 :
1478 398100 : next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
1479 : dlist_next_node(&leaf->segments, iter.cur));
1480 398100 : if (next->items)
1481 24766 : next_first = next->items[0];
1482 : else
1483 : {
1484 373334 : Assert(next->seg != NULL);
1485 373334 : next_first = next->seg->first;
1486 : }
1487 :
1488 398100 : nthis = 0;
1489 398100 : while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
3364 heikki.linnakangas 1490 UBC 0 : nthis++;
1491 : }
3364 heikki.linnakangas 1492 CBC 422872 : if (nthis == 0)
1493 398100 : continue;
1494 :
1495 : /* Merge the new items with the existing items. */
1496 24772 : if (!cur->items)
3364 heikki.linnakangas 1497 UBC 0 : cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
1498 :
1499 : /*
1500 : * Fast path for the important special case that we're appending to
1501 : * the end of the page: don't let the last segment on the page grow
1502 : * larger than the target, create a new segment before that happens.
1503 : */
3296 heikki.linnakangas 1504 CBC 49544 : if (!dlist_has_next(&leaf->segments, iter.cur) &&
1505 24772 : ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
1506 24772 : cur->seg != NULL &&
1507 24772 : SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
1508 : {
1509 184 : newseg = palloc(sizeof(leafSegmentInfo));
1510 184 : newseg->seg = NULL;
1511 184 : newseg->items = nextnew;
1512 184 : newseg->nitems = nthis;
1513 184 : newseg->action = GIN_SEGMENT_INSERT;
1514 184 : dlist_push_tail(&leaf->segments, &newseg->node);
1515 184 : modified = true;
1516 24772 : break;
1517 : }
1518 :
3303 1519 24588 : tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
1520 : nextnew, nthis,
1521 : &ntmpitems);
3364 1522 24588 : if (ntmpitems != cur->nitems)
1523 : {
1524 : /*
1525 : * If there are no duplicates, track the added items so that we
1526 : * can emit a compact ADDITEMS WAL record later on. (it doesn't
1527 : * seem worth re-checking which items were duplicates, if there
1528 : * were any)
1529 : */
3296 1530 24588 : if (ntmpitems == nthis + cur->nitems &&
1531 24588 : cur->action == GIN_SEGMENT_UNMODIFIED)
1532 : {
1533 24588 : cur->action = GIN_SEGMENT_ADDITEMS;
1534 24588 : cur->modifieditems = nextnew;
1535 24588 : cur->nmodifieditems = nthis;
1536 : }
1537 : else
3296 heikki.linnakangas 1538 UBC 0 : cur->action = GIN_SEGMENT_REPLACE;
1539 :
3364 heikki.linnakangas 1540 CBC 24588 : cur->items = tmpitems;
1541 24588 : cur->nitems = ntmpitems;
1542 24588 : cur->seg = NULL;
3296 1543 24588 : modified = true;
1544 : }
1545 :
3364 1546 24588 : nextnew += nthis;
1547 24588 : newleft -= nthis;
1548 24588 : if (newleft == 0)
1549 24588 : break;
1550 : }
1551 :
1552 24772 : return modified;
1553 : }
1554 :
1555 : /*
1556 : * Recompresses all segments that have been modified.
1557 : *
1558 : * If not all the items fit on two pages (ie. after split), we store as
1559 : * many items as fit, and set *remaining to the first item that didn't fit.
1560 : * If all items fit, *remaining is set to invalid.
1561 : *
1562 : * Returns true if the page has to be split.
1563 : */
1564 : static bool
1565 24772 : leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
1566 : {
1567 24772 : int pgused = 0;
3296 1568 24772 : bool needsplit = false;
1569 : dlist_iter iter;
1570 : int segsize;
1571 : leafSegmentInfo *nextseg;
1572 : int npacked;
1573 : bool modified;
1574 : dlist_node *cur_node;
1575 : dlist_node *next_node;
1576 :
3364 1577 24772 : ItemPointerSetInvalid(remaining);
1578 :
1579 : /*
1580 : * cannot use dlist_foreach_modify here because we insert adjacent items
1581 : * while iterating.
1582 : */
3296 1583 24772 : for (cur_node = dlist_head_node(&leaf->segments);
1584 448717 : cur_node != NULL;
1585 423945 : cur_node = next_node)
1586 : {
1587 423965 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
1588 : cur_node);
1589 :
1590 423965 : if (dlist_has_next(&leaf->segments, cur_node))
1591 398284 : next_node = dlist_next_node(&leaf->segments, cur_node);
1592 : else
1593 25681 : next_node = NULL;
1594 :
1595 : /* Compress the posting list, if necessary */
1596 423965 : if (seginfo->action != GIN_SEGMENT_DELETE)
1597 : {
1598 423965 : if (seginfo->seg == NULL)
1599 : {
1600 25681 : if (seginfo->nitems > GinPostingListSegmentMaxSize)
3260 bruce 1601 926 : npacked = 0; /* no chance that it would fit. */
1602 : else
1603 : {
3296 heikki.linnakangas 1604 24755 : seginfo->seg = ginCompressPostingList(seginfo->items,
1605 : seginfo->nitems,
1606 : GinPostingListSegmentMaxSize,
1607 : &npacked);
1608 : }
1609 25681 : if (npacked != seginfo->nitems)
1610 : {
1611 : /*
1612 : * Too large. Compress again to the target size, and
1613 : * create a new segment to represent the remaining items.
1614 : * The new segment is inserted after this one, so it will
1615 : * be processed in the next iteration of this loop.
1616 : */
1617 929 : if (seginfo->seg)
1618 3 : pfree(seginfo->seg);
1619 929 : seginfo->seg = ginCompressPostingList(seginfo->items,
1620 : seginfo->nitems,
1621 : GinPostingListSegmentTargetSize,
1622 : &npacked);
1623 929 : if (seginfo->action != GIN_SEGMENT_INSERT)
1624 3 : seginfo->action = GIN_SEGMENT_REPLACE;
1625 :
1626 929 : nextseg = palloc(sizeof(leafSegmentInfo));
1627 929 : nextseg->action = GIN_SEGMENT_INSERT;
1628 929 : nextseg->seg = NULL;
1629 929 : nextseg->items = &seginfo->items[npacked];
1630 929 : nextseg->nitems = seginfo->nitems - npacked;
1631 929 : next_node = &nextseg->node;
1632 929 : dlist_insert_after(cur_node, next_node);
1633 : }
1634 : }
1635 :
1636 : /*
1637 : * If the segment is very small, merge it with the next segment.
1638 : */
1639 423965 : if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
1640 : {
1641 : int nmerged;
1642 :
3296 heikki.linnakangas 1643 UBC 0 : nextseg = dlist_container(leafSegmentInfo, node, next_node);
1644 :
1645 0 : if (seginfo->items == NULL)
1646 0 : seginfo->items = ginPostingListDecode(seginfo->seg,
1647 : &seginfo->nitems);
1648 0 : if (nextseg->items == NULL)
1649 0 : nextseg->items = ginPostingListDecode(nextseg->seg,
1650 : &nextseg->nitems);
1651 0 : nextseg->items =
1652 0 : ginMergeItemPointers(seginfo->items, seginfo->nitems,
1653 0 : nextseg->items, nextseg->nitems,
1654 : &nmerged);
1655 0 : Assert(nmerged == seginfo->nitems + nextseg->nitems);
1656 0 : nextseg->nitems = nmerged;
1657 0 : nextseg->seg = NULL;
1658 :
1659 0 : nextseg->action = GIN_SEGMENT_REPLACE;
1660 0 : nextseg->modifieditems = NULL;
1661 0 : nextseg->nmodifieditems = 0;
1662 :
1663 0 : if (seginfo->action == GIN_SEGMENT_INSERT)
1664 : {
1665 0 : dlist_delete(cur_node);
1666 0 : continue;
1667 : }
1668 : else
1669 : {
1670 0 : seginfo->action = GIN_SEGMENT_DELETE;
1671 0 : seginfo->seg = NULL;
1672 : }
1673 : }
1674 :
3296 heikki.linnakangas 1675 CBC 423965 : seginfo->items = NULL;
1676 423965 : seginfo->nitems = 0;
1677 : }
1678 :
1679 423965 : if (seginfo->action == GIN_SEGMENT_DELETE)
3296 heikki.linnakangas 1680 UBC 0 : continue;
1681 :
1682 : /*
1683 : * OK, we now have a compressed version of this segment ready for
1684 : * copying to the page. Did we exceed the size that fits on one page?
1685 : */
3296 heikki.linnakangas 1686 CBC 423965 : segsize = SizeOfGinPostingList(seginfo->seg);
3282 1687 423965 : if (pgused + segsize > GinDataPageMaxDataSize)
1688 : {
3364 1689 95 : if (!needsplit)
1690 : {
1691 : /* switch to right page */
1692 75 : Assert(pgused > 0);
3296 1693 75 : leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
3364 1694 75 : needsplit = true;
1695 75 : leaf->lsize = pgused;
1696 75 : pgused = 0;
1697 : }
1698 : else
1699 : {
1700 : /*
1701 : * Filled both pages. The last segment we constructed did not
1702 : * fit.
1703 : */
3296 1704 20 : *remaining = seginfo->seg->first;
1705 :
1706 : /*
1707 : * remove all segments that did not fit from the list.
1708 : */
1709 40 : while (dlist_has_next(&leaf->segments, cur_node))
1710 20 : dlist_delete(dlist_next_node(&leaf->segments, cur_node));
1711 20 : dlist_delete(cur_node);
3364 1712 20 : break;
1713 : }
1714 : }
1715 :
1716 423945 : pgused += segsize;
1717 : }
1718 :
1719 24772 : if (!needsplit)
1720 : {
1721 24697 : leaf->lsize = pgused;
1722 24697 : leaf->rsize = 0;
1723 : }
1724 : else
1725 75 : leaf->rsize = pgused;
1726 :
3282 1727 24772 : Assert(leaf->lsize <= GinDataPageMaxDataSize);
1728 24772 : Assert(leaf->rsize <= GinDataPageMaxDataSize);
1729 :
1730 : /*
1731 : * Make a palloc'd copy of every segment after the first modified one,
1732 : * because as we start copying items to the original page, we might
1733 : * overwrite an existing segment.
1734 : */
3296 1735 24772 : modified = false;
1736 448717 : dlist_foreach(iter, &leaf->segments)
1737 : {
1738 423945 : leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
1739 : iter.cur);
1740 :
1741 423945 : if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
1742 : {
1743 24772 : modified = true;
1744 : }
1745 399173 : else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
1746 : {
1747 : GinPostingList *tmp;
1748 :
3296 heikki.linnakangas 1749 UBC 0 : segsize = SizeOfGinPostingList(seginfo->seg);
1750 0 : tmp = palloc(segsize);
1751 0 : memcpy(tmp, seginfo->seg, segsize);
1752 0 : seginfo->seg = tmp;
1753 : }
1754 : }
1755 :
3364 heikki.linnakangas 1756 CBC 24772 : return needsplit;
1757 : }
1758 :
1759 :
1760 : /*** Functions that are exported to the rest of the GIN code ***/
1761 :
1762 : /*
1763 : * Creates new posting tree containing the given TIDs. Returns the page
1764 : * number of the root of the new posting tree.
1765 : *
1766 : * items[] must be in sorted order with no duplicates.
1767 : */
1768 : BlockNumber
3441 1769 58 : createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
1770 : GinStatsData *buildStats, Buffer entrybuffer)
1771 : {
1772 : BlockNumber blkno;
1773 : Buffer buffer;
1774 : Page tmppage;
1775 : Page page;
1776 : Pointer ptr;
1777 : int nrootitems;
1778 : int rootsize;
1467 1779 58 : bool is_build = (buildStats != NULL);
1780 :
1781 : /* Construct the new root page in memory first. */
3291 1782 58 : tmppage = (Page) palloc(BLCKSZ);
1783 58 : GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1784 58 : GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber;
1785 :
1786 : /*
1787 : * Write as many of the items to the root page as fit. In segments of max
1788 : * GinPostingListSegmentMaxSize bytes each.
1789 : */
3364 1790 58 : nrootitems = 0;
1791 58 : rootsize = 0;
3291 1792 58 : ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage);
3364 1793 1166 : while (nrootitems < nitems)
1794 : {
1795 : GinPostingList *segment;
1796 : int npacked;
1797 : int segsize;
1798 :
1799 1158 : segment = ginCompressPostingList(&items[nrootitems],
1800 1158 : nitems - nrootitems,
1801 : GinPostingListSegmentMaxSize,
1802 : &npacked);
1803 1158 : segsize = SizeOfGinPostingList(segment);
3282 1804 1158 : if (rootsize + segsize > GinDataPageMaxDataSize)
3364 1805 50 : break;
1806 :
1807 1108 : memcpy(ptr, segment, segsize);
1808 1108 : ptr += segsize;
1809 1108 : rootsize += segsize;
1810 1108 : nrootitems += npacked;
1811 1108 : pfree(segment);
1812 : }
3282 1813 58 : GinDataPageSetDataSize(tmppage, rootsize);
1814 :
1815 : /*
1816 : * All set. Get a new physical page, and copy the in-memory page to it.
1817 : */
3291 1818 58 : buffer = GinNewBuffer(index);
2545 kgrittn 1819 58 : page = BufferGetPage(buffer);
3291 heikki.linnakangas 1820 58 : blkno = BufferGetBlockNumber(buffer);
1821 :
1822 : /*
1823 : * Copy any predicate locks from the entry tree leaf (containing posting
1824 : * list) to the posting tree.
1825 : */
1836 teodor 1826 58 : PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
1827 :
3291 heikki.linnakangas 1828 58 : START_CRIT_SECTION();
1829 :
1830 58 : PageRestoreTempPage(tmppage, page);
1831 58 : MarkBufferDirty(buffer);
1832 :
1467 1833 58 : if (RelationNeedsWAL(index) && !is_build)
1834 : {
1835 : XLogRecPtr recptr;
1836 : ginxlogCreatePostingTree data;
1837 :
3364 1838 14 : data.size = rootsize;
1839 :
3062 1840 14 : XLogBeginInsert();
1841 14 : XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree));
1842 :
1843 14 : XLogRegisterData((char *) GinDataLeafPageGetPostingList(page),
1844 : rootsize);
1845 14 : XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
1846 :
1847 14 : recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE);
3441 1848 14 : PageSetLSN(page, recptr);
1849 : }
1850 :
1851 58 : UnlockReleaseBuffer(buffer);
1852 :
1853 58 : END_CRIT_SECTION();
1854 :
1855 : /* During index build, count the newly-added data page */
1856 58 : if (buildStats)
1857 38 : buildStats->nDataPages++;
1858 :
3291 1859 58 : elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
1860 :
1861 : /*
1862 : * Add any remaining TIDs to the newly-created posting tree.
1863 : */
3434 1864 58 : if (nitems > nrootitems)
1865 : {
3427 1866 50 : ginInsertItemPointers(index, blkno,
3434 1867 50 : items + nrootitems,
1868 : nitems - nrootitems,
1869 : buildStats);
1870 : }
1871 :
3441 1872 58 : return blkno;
1873 : }
1874 :
1875 : static void
3420 1876 24792 : ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
1877 : {
6186 teodor 1878 24792 : memset(btree, 0, sizeof(GinBtreeData));
1879 :
1880 24792 : btree->index = index;
3420 heikki.linnakangas 1881 24792 : btree->rootBlkno = rootBlkno;
1882 :
6186 teodor 1883 24792 : btree->findChildPage = dataLocateItem;
3427 heikki.linnakangas 1884 24792 : btree->getLeftMostChild = dataGetLeftMostPage;
4557 tgl 1885 24792 : btree->isMoveRight = dataIsMoveRight;
3364 heikki.linnakangas 1886 24792 : btree->findItem = NULL;
6186 teodor 1887 24792 : btree->findChildPtr = dataFindChildPtr;
2545 tgl 1888 24792 : btree->beginPlaceToPage = dataBeginPlaceToPage;
1889 24792 : btree->execPlaceToPage = dataExecPlaceToPage;
4557 1890 24792 : btree->fillRoot = ginDataFillRoot;
3427 heikki.linnakangas 1891 24792 : btree->prepareDownlink = dataPrepareDownlink;
1892 :
2062 peter_e 1893 24792 : btree->isData = true;
1894 24792 : btree->fullScan = false;
1895 24792 : btree->isBuild = false;
6186 teodor 1896 24792 : }
1897 :
1898 : /*
1899 : * Inserts array of item pointers, may execute several tree scan (very rare)
1900 : */
1901 : void
3427 heikki.linnakangas 1902 24754 : ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
1903 : ItemPointerData *items, uint32 nitem,
1904 : GinStatsData *buildStats)
1905 : {
1906 : GinBtreeData btree;
1907 : GinBtreeDataLeafInsertData insertdata;
1908 : GinBtreeStack *stack;
1909 :
3420 1910 24754 : ginPrepareDataScan(&btree, index, rootBlkno);
3427 1911 24754 : btree.isBuild = (buildStats != NULL);
3420 1912 24754 : insertdata.items = items;
1913 24754 : insertdata.nitem = nitem;
1914 24754 : insertdata.curitem = 0;
1915 :
1916 49526 : while (insertdata.curitem < insertdata.nitem)
1917 : {
1918 : /* search for the leaf page where the first item should go to */
1919 24774 : btree.itemptr = insertdata.items[insertdata.curitem];
1564 akorotkov 1920 24774 : stack = ginFindLeafPage(&btree, false, true, NULL);
1921 :
3364 heikki.linnakangas 1922 24772 : ginInsertValue(&btree, stack, &insertdata, buildStats);
1923 : }
6186 teodor 1924 24752 : }
1925 :
1926 : /*
1927 : * Starts a new scan on a posting tree.
1928 : */
1929 : GinBtreeStack *
2557 kgrittn 1930 38 : ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno,
1931 : Snapshot snapshot)
1932 : {
1933 : GinBtreeStack *stack;
1934 :
3357 heikki.linnakangas 1935 38 : ginPrepareDataScan(btree, index, rootBlkno);
1936 :
2062 peter_e 1937 38 : btree->fullScan = true;
1938 :
1564 akorotkov 1939 38 : stack = ginFindLeafPage(btree, true, false, snapshot);
1940 :
3427 heikki.linnakangas 1941 38 : return stack;
1942 : }
|