Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2023, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "access/htup_details.h"
44 : #include "common/hashfn.h"
45 : #include "nodes/bitmapset.h"
46 : #include "nodes/tidbitmap.h"
47 : #include "storage/lwlock.h"
48 : #include "utils/dsa.h"
49 :
/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages), so a fixed-size per-page bitmap is
 * good enough; we simply legislate that the size is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred from one hashtable to another.)  Therefore it's best if
 * PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations.  So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
84 :
85 : /*
86 : * The hashtable entries are represented by this data structure. For
87 : * an exact page, blockno is the page number and bit k of the bitmap
88 : * represents tuple offset k+1. For a lossy chunk, blockno is the first
89 : * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
90 : * bit k represents page blockno+k. Note that it is not possible to
91 : * have exact storage for the first page of a chunk if we are using
92 : * lossy storage for any page in the chunk's range, since the same
93 : * hashtable entry has to serve both purposes.
94 : *
95 : * recheck is used only on exact pages --- it indicates that although
96 : * only the stated tuples need be checked, the full index qual condition
97 : * must be checked for each (ie, these are candidate matches).
98 : */
99 : typedef struct PagetableEntry
100 : {
101 : BlockNumber blockno; /* page number (hashtable key) */
102 : char status; /* hash entry status */
103 : bool ischunk; /* T = lossy storage, F = exact */
104 : bool recheck; /* should the tuples be rechecked? */
105 : bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
106 : } PagetableEntry;
107 :
108 : /*
109 : * Holds array of pagetable entries.
110 : */
111 : typedef struct PTEntryArray
112 : {
113 : pg_atomic_uint32 refcount; /* no. of iterator attached */
114 : PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
115 : } PTEntryArray;
116 :
/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using
 * a bitmap scan on the inside of a nestloop join: a bitmap may well live
 * only long enough to accumulate one entry in such cases.  We therefore
 * avoid creating an actual hashtable until we need two pagetable entries.
 * When just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again.  So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;
134 :
/*
 * Current iterating state of the TBM.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
} TBMIteratingState;
144 :
145 : /*
146 : * Here is the representation for a whole TIDBitMap:
147 : */
148 : struct TIDBitmap
149 : {
150 : NodeTag type; /* to make it a valid Node */
151 : MemoryContext mcxt; /* memory context containing me */
152 : TBMStatus status; /* see codes above */
153 : struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
154 : int nentries; /* number of entries in pagetable */
155 : int maxentries; /* limit on same to meet maxbytes */
156 : int npages; /* number of exact entries in pagetable */
157 : int nchunks; /* number of lossy entries in pagetable */
158 : TBMIteratingState iterating; /* tbm_begin_iterate called? */
159 : uint32 lossify_start; /* offset to start lossifying hashtable at */
160 : PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
161 : /* these are valid when iterating is true: */
162 : PagetableEntry **spages; /* sorted exact-page list, or NULL */
163 : PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
164 : dsa_pointer dsapagetable; /* dsa_pointer to the element array */
165 : dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
166 : dsa_pointer ptpages; /* dsa_pointer to the page array */
167 : dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
168 : dsa_area *dsa; /* reference to per-query dsa area */
169 : };
170 :
171 : /*
172 : * When iterating over a bitmap in sorted order, a TBMIterator is used to
173 : * track our progress. There can be several iterators scanning the same
174 : * bitmap concurrently. Note that the bitmap becomes read-only as soon as
175 : * any iterator is created.
176 : */
177 : struct TBMIterator
178 : {
179 : TIDBitmap *tbm; /* TIDBitmap we're iterating over */
180 : int spageptr; /* next spages index */
181 : int schunkptr; /* next schunks index */
182 : int schunkbit; /* next bit to check in current schunk */
183 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
184 : };
185 :
186 : /*
187 : * Holds the shared members of the iterator so that multiple processes
188 : * can jointly iterate.
189 : */
190 : typedef struct TBMSharedIteratorState
191 : {
192 : int nentries; /* number of entries in pagetable */
193 : int maxentries; /* limit on same to meet maxbytes */
194 : int npages; /* number of exact entries in pagetable */
195 : int nchunks; /* number of lossy entries in pagetable */
196 : dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
197 : dsa_pointer spages; /* dsa pointer to page array */
198 : dsa_pointer schunks; /* dsa pointer to chunk array */
199 : LWLock lock; /* lock to protect below members */
200 : int spageptr; /* next spages index */
201 : int schunkptr; /* next schunks index */
202 : int schunkbit; /* next bit to check in current schunk */
203 : } TBMSharedIteratorState;
204 :
205 : /*
206 : * pagetable iteration array.
207 : */
208 : typedef struct PTIterationArray
209 : {
210 : pg_atomic_uint32 refcount; /* no. of iterator attached */
211 : int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
212 : } PTIterationArray;
213 :
214 : /*
215 : * same as TBMIterator, but it is used for joint iteration, therefore this
216 : * also holds a reference to the shared state.
217 : */
218 : struct TBMSharedIterator
219 : {
220 : TBMSharedIteratorState *state; /* shared state */
221 : PTEntryArray *ptbase; /* pagetable element array */
222 : PTIterationArray *ptpages; /* sorted exact page index list */
223 : PTIterationArray *ptchunks; /* sorted lossy page index list */
224 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
225 : };
226 :
227 : /* Local function prototypes */
228 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
229 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230 : const TIDBitmap *b);
231 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232 : BlockNumber pageno);
233 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
234 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
235 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
236 : static void tbm_lossify(TIDBitmap *tbm);
237 : static int tbm_comparator(const void *left, const void *right);
238 : static int tbm_shared_comparator(const void *left, const void *right,
239 : void *arg);
240 :
241 : /* define hashtable mapping block numbers to PagetableEntry's */
242 : #define SH_USE_NONDEFAULT_ALLOCATOR
243 : #define SH_PREFIX pagetable
244 : #define SH_ELEMENT_TYPE PagetableEntry
245 : #define SH_KEY_TYPE BlockNumber
246 : #define SH_KEY blockno
247 : #define SH_HASH_KEY(tb, key) murmurhash32(key)
248 : #define SH_EQUAL(tb, a, b) a == b
249 : #define SH_SCOPE static inline
250 : #define SH_DEFINE
251 : #define SH_DECLARE
252 : #include "lib/simplehash.h"
253 :
254 :
255 : /*
256 : * tbm_create - create an initially-empty bitmap
257 : *
258 : * The bitmap will live in the memory context that is CurrentMemoryContext
259 : * at the time of this call. It will be limited to (approximately) maxbytes
260 : * total memory consumption. If the DSA passed to this function is not NULL
261 : * then the memory for storing elements of the underlying page table will
262 : * be allocated from the DSA.
263 : */
264 : TIDBitmap *
2223 rhaas 265 CBC 8855 : tbm_create(long maxbytes, dsa_area *dsa)
266 : {
267 : TIDBitmap *tbm;
268 :
269 : /* Create the TIDBitmap struct and zero all its fields */
5202 tgl 270 8855 : tbm = makeNode(TIDBitmap);
271 :
6566 272 8855 : tbm->mcxt = CurrentMemoryContext;
6536 273 8855 : tbm->status = TBM_EMPTY;
274 :
1976 rhaas 275 8855 : tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
2368 andres 276 8855 : tbm->lossify_start = 0;
2223 rhaas 277 8855 : tbm->dsa = dsa;
2215 278 8855 : tbm->dsapagetable = InvalidDsaPointer;
279 8855 : tbm->dsapagetableold = InvalidDsaPointer;
280 8855 : tbm->ptpages = InvalidDsaPointer;
281 8855 : tbm->ptchunks = InvalidDsaPointer;
282 :
6536 tgl 283 8855 : return tbm;
284 : }
285 :
286 : /*
287 : * Actually create the hashtable. Since this is a moderately expensive
288 : * proposition, we don't do it until we have to.
289 : */
290 : static void
291 4249 : tbm_create_pagetable(TIDBitmap *tbm)
292 : {
293 4249 : Assert(tbm->status != TBM_HASH);
294 4249 : Assert(tbm->pagetable == NULL);
295 :
2223 rhaas 296 4249 : tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
297 :
298 : /* If entry1 is valid, push it into the hashtable */
6536 tgl 299 4249 : if (tbm->status == TBM_ONE_PAGE)
300 : {
301 : PagetableEntry *page;
302 : bool found;
303 : char oldstatus;
304 :
2368 andres 305 2965 : page = pagetable_insert(tbm->pagetable,
306 : tbm->entry1.blockno,
307 : &found);
6536 tgl 308 2965 : Assert(!found);
2368 andres 309 2965 : oldstatus = page->status;
6536 tgl 310 2965 : memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
2368 andres 311 2965 : page->status = oldstatus;
312 : }
313 :
6536 tgl 314 4249 : tbm->status = TBM_HASH;
6566 315 4249 : }
316 :
317 : /*
318 : * tbm_free - free a TIDBitmap
319 : */
320 : void
321 8824 : tbm_free(TIDBitmap *tbm)
322 : {
6536 323 8824 : if (tbm->pagetable)
2368 andres 324 4248 : pagetable_destroy(tbm->pagetable);
6566 tgl 325 8824 : if (tbm->spages)
326 2899 : pfree(tbm->spages);
327 8824 : if (tbm->schunks)
328 1290 : pfree(tbm->schunks);
329 8824 : pfree(tbm);
330 8824 : }
331 :
332 : /*
333 : * tbm_free_shared_area - free shared state
334 : *
335 : * Free shared iterator state, Also free shared pagetable and iterator arrays
336 : * memory if they are not referred by any of the shared iterator i.e recount
337 : * is becomes 0.
338 : */
339 : void
2223 rhaas 340 54 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
341 : {
342 54 : TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
343 : PTEntryArray *ptbase;
344 : PTIterationArray *ptpages;
345 : PTIterationArray *ptchunks;
346 :
2215 347 54 : if (DsaPointerIsValid(istate->pagetable))
348 : {
349 54 : ptbase = dsa_get_address(dsa, istate->pagetable);
350 54 : if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
351 27 : dsa_free(dsa, istate->pagetable);
352 : }
353 54 : if (DsaPointerIsValid(istate->spages))
354 : {
2223 355 54 : ptpages = dsa_get_address(dsa, istate->spages);
356 54 : if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
357 27 : dsa_free(dsa, istate->spages);
358 : }
2215 359 54 : if (DsaPointerIsValid(istate->schunks))
360 : {
2223 rhaas 361 UBC 0 : ptchunks = dsa_get_address(dsa, istate->schunks);
362 0 : if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
363 0 : dsa_free(dsa, istate->schunks);
364 : }
365 :
2223 rhaas 366 CBC 54 : dsa_free(dsa, dp);
367 54 : }
368 :
369 : /*
370 : * tbm_add_tuples - add some tuple IDs to a TIDBitmap
371 : *
372 : * If recheck is true, then the recheck flag will be set in the
373 : * TBMIterateResult when any of these tuples are reported out.
374 : */
375 : void
5477 tgl 376 4227065 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
377 : bool recheck)
378 : {
3005 379 4227065 : BlockNumber currblk = InvalidBlockNumber;
380 4227065 : PagetableEntry *page = NULL; /* only valid when currblk is valid */
381 : int i;
382 :
2223 rhaas 383 4227065 : Assert(tbm->iterating == TBM_NOT_ITERATING);
6566 tgl 384 9544847 : for (i = 0; i < ntids; i++)
385 : {
386 5317782 : BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
387 5317782 : OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
388 : int wordnum,
389 : bitnum;
390 :
391 : /* safety check to ensure we don't overrun bit array bounds */
392 5317782 : if (off < 1 || off > MAX_TUPLES_PER_PAGE)
6566 tgl 393 UBC 0 : elog(ERROR, "tuple offset out of range: %u", off);
394 :
395 : /*
396 : * Look up target page unless we already did. This saves cycles when
397 : * the input includes consecutive tuples on the same page, which is
398 : * common enough to justify an extra test here.
399 : */
3005 tgl 400 CBC 5317782 : if (blk != currblk)
401 : {
andres 402 4615418 : if (tbm_page_is_lossy(tbm, blk))
tgl 403 1428 : page = NULL; /* remember page is lossy */
404 : else
405 4613990 : page = tbm_get_pageentry(tbm, blk);
406 4615418 : currblk = blk;
407 : }
408 :
409 5317782 : if (page == NULL)
410 1428 : continue; /* whole page is already marked */
411 :
6566 412 5316354 : if (page->ischunk)
413 : {
414 : /* The page is a lossy chunk header, set bit for itself */
6566 tgl 415 UBC 0 : wordnum = bitnum = 0;
416 : }
417 : else
418 : {
419 : /* Page is exact, so set bit for individual tuple */
6566 tgl 420 CBC 5316354 : wordnum = WORDNUM(off - 1);
421 5316354 : bitnum = BITNUM(off - 1);
422 : }
423 5316354 : page->words[wordnum] |= ((bitmapword) 1 << bitnum);
5477 424 5316354 : page->recheck |= recheck;
425 :
6566 426 5316354 : if (tbm->nentries > tbm->maxentries)
427 : {
428 12 : tbm_lossify(tbm);
429 : /* Page could have been converted to lossy, so force new lookup */
3005 430 12 : currblk = InvalidBlockNumber;
431 : }
432 : }
6566 433 4227065 : }
434 :
435 : /*
436 : * tbm_add_page - add a whole page to a TIDBitmap
437 : *
438 : * This causes the whole page to be reported (with the recheck flag)
439 : * when the TIDBitmap is scanned.
440 : */
441 : void
5129 442 67674 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
443 : {
444 : /* Enter the page in the bitmap, or mark it lossy if already present */
445 67674 : tbm_mark_page_lossy(tbm, pageno);
446 : /* If we went over the memory limit, lossify some more pages */
447 67674 : if (tbm->nentries > tbm->maxentries)
5129 tgl 448 UBC 0 : tbm_lossify(tbm);
5129 tgl 449 CBC 67674 : }
450 :
451 : /*
452 : * tbm_union - set union
453 : *
454 : * a is modified in-place, b is not changed
455 : */
456 : void
6566 tgl 457 UBC 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
458 : {
6536 459 0 : Assert(!a->iterating);
460 : /* Nothing to do if b is empty */
461 0 : if (b->nentries == 0)
462 0 : return;
463 : /* Scan through chunks and pages in b, merge into a */
464 0 : if (b->status == TBM_ONE_PAGE)
465 0 : tbm_union_page(a, &b->entry1);
466 : else
467 : {
468 : pagetable_iterator i;
469 : PagetableEntry *bpage;
470 :
471 0 : Assert(b->status == TBM_HASH);
2368 andres 472 0 : pagetable_start_iterate(b->pagetable, &i);
473 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
6536 tgl 474 0 : tbm_union_page(a, bpage);
475 : }
476 : }
477 :
478 : /* Process one page of b during a union op */
479 : static void
480 0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
481 : {
482 : PagetableEntry *apage;
483 : int wordnum;
484 :
485 0 : if (bpage->ischunk)
486 : {
487 : /* Scan b's chunk, mark each indicated page lossy in a */
3432 488 0 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
489 : {
6536 490 0 : bitmapword w = bpage->words[wordnum];
491 :
492 0 : if (w != 0)
493 : {
494 : BlockNumber pg;
495 :
496 0 : pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
497 0 : while (w != 0)
498 : {
499 0 : if (w & 1)
500 0 : tbm_mark_page_lossy(a, pg);
501 0 : pg++;
502 0 : w >>= 1;
503 : }
504 : }
505 : }
506 : }
507 0 : else if (tbm_page_is_lossy(a, bpage->blockno))
508 : {
509 : /* page is already lossy in a, nothing to do */
510 0 : return;
511 : }
512 : else
513 : {
514 0 : apage = tbm_get_pageentry(a, bpage->blockno);
515 0 : if (apage->ischunk)
516 : {
517 : /* The page is a lossy chunk header, set bit for itself */
518 0 : apage->words[0] |= ((bitmapword) 1 << 0);
519 : }
520 : else
521 : {
522 : /* Both pages are exact, merge at the bit level */
523 0 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
524 0 : apage->words[wordnum] |= bpage->words[wordnum];
5477 525 0 : apage->recheck |= bpage->recheck;
526 : }
527 : }
528 :
6536 529 0 : if (a->nentries > a->maxentries)
530 0 : tbm_lossify(a);
531 : }
532 :
533 : /*
534 : * tbm_intersect - set intersection
535 : *
536 : * a is modified in-place, b is not changed
537 : */
538 : void
6566 tgl 539 CBC 26 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
540 : {
541 26 : Assert(!a->iterating);
542 : /* Nothing to do if a is empty */
6536 543 26 : if (a->nentries == 0)
6536 tgl 544 UBC 0 : return;
545 : /* Scan through chunks and pages in a, try to match to b */
6536 tgl 546 CBC 26 : if (a->status == TBM_ONE_PAGE)
547 : {
6468 548 12 : if (tbm_intersect_page(a, &a->entry1, b))
549 : {
550 : /* Page is now empty, remove it from a */
6536 tgl 551 UBC 0 : Assert(!a->entry1.ischunk);
552 0 : a->npages--;
553 0 : a->nentries--;
554 0 : Assert(a->nentries == 0);
555 0 : a->status = TBM_EMPTY;
556 : }
557 : }
558 : else
559 : {
560 : pagetable_iterator i;
561 : PagetableEntry *apage;
562 :
6536 tgl 563 CBC 14 : Assert(a->status == TBM_HASH);
2368 andres 564 14 : pagetable_start_iterate(a->pagetable, &i);
565 2441 : while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
566 : {
6468 tgl 567 2413 : if (tbm_intersect_page(a, apage, b))
568 : {
569 : /* Page or chunk is now empty, remove it from a */
6536 570 2321 : if (apage->ischunk)
6536 tgl 571 UBC 0 : a->nchunks--;
572 : else
6536 tgl 573 CBC 2321 : a->npages--;
574 2321 : a->nentries--;
2368 andres 575 2321 : if (!pagetable_delete(a->pagetable, apage->blockno))
6566 tgl 576 UBC 0 : elog(ERROR, "hash table corrupted");
577 : }
578 : }
579 : }
580 : }
581 :
582 : /*
583 : * Process one page of a during an intersection op
584 : *
585 : * Returns true if apage is now empty and should be deleted from a
586 : */
587 : static bool
6468 tgl 588 CBC 2425 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
589 : {
590 : const PagetableEntry *bpage;
591 : int wordnum;
592 :
6536 593 2425 : if (apage->ischunk)
594 : {
595 : /* Scan each bit in chunk, try to clear */
6385 bruce 596 15 : bool candelete = true;
597 :
3432 tgl 598 75 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
599 : {
6536 600 60 : bitmapword w = apage->words[wordnum];
601 :
602 60 : if (w != 0)
603 : {
604 54 : bitmapword neww = w;
605 : BlockNumber pg;
606 : int bitnum;
607 :
608 54 : pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
609 54 : bitnum = 0;
610 3303 : while (w != 0)
611 : {
612 3249 : if (w & 1)
613 : {
614 1680 : if (!tbm_page_is_lossy(b, pg) &&
615 126 : tbm_find_pageentry(b, pg) == NULL)
616 : {
617 : /* Page is not in b at all, lose lossy bit */
6536 tgl 618 UBC 0 : neww &= ~((bitmapword) 1 << bitnum);
619 : }
620 : }
6536 tgl 621 CBC 3249 : pg++;
622 3249 : bitnum++;
623 3249 : w >>= 1;
624 : }
625 54 : apage->words[wordnum] = neww;
626 54 : if (neww != 0)
627 54 : candelete = false;
628 : }
629 : }
630 15 : return candelete;
631 : }
632 2410 : else if (tbm_page_is_lossy(b, apage->blockno))
633 : {
634 : /*
635 : * Some of the tuples in 'a' might not satisfy the quals for 'b', but
636 : * because the page 'b' is lossy, we don't know which ones. Therefore
637 : * we mark 'a' as requiring rechecks, to indicate that at most those
638 : * tuples set in 'a' are matches.
639 : */
5477 tgl 640 UBC 0 : apage->recheck = true;
6536 641 0 : return false;
642 : }
643 : else
644 : {
6385 bruce 645 CBC 2410 : bool candelete = true;
646 :
6536 tgl 647 2410 : bpage = tbm_find_pageentry(b, apage->blockno);
648 2410 : if (bpage != NULL)
649 : {
650 : /* Both pages are exact, merge at the bit level */
651 2146 : Assert(!bpage->ischunk);
652 12876 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
653 : {
654 10730 : apage->words[wordnum] &= bpage->words[wordnum];
655 10730 : if (apage->words[wordnum] != 0)
656 90 : candelete = false;
657 : }
5477 658 2146 : apage->recheck |= bpage->recheck;
659 : }
660 : /* If there is no matching b page, we can just delete the a page */
6536 661 2410 : return candelete;
662 : }
663 : }
664 :
665 : /*
666 : * tbm_is_empty - is a TIDBitmap completely empty?
667 : */
668 : bool
6433 669 319 : tbm_is_empty(const TIDBitmap *tbm)
670 : {
671 319 : return (tbm->nentries == 0);
672 : }
673 :
674 : /*
675 : * tbm_begin_iterate - prepare to iterate through a TIDBitmap
676 : *
677 : * The TBMIterator struct is created in the caller's memory context.
678 : * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
679 : * okay to just allow the memory context to be released, too. It is caller's
680 : * responsibility not to touch the TBMIterator anymore once the TIDBitmap
681 : * is freed.
682 : *
683 : * NB: after this is called, it is no longer allowed to modify the contents
684 : * of the bitmap. However, you can call this multiple times to scan the
685 : * contents repeatedly, including parallel scans.
686 : */
687 : TBMIterator *
6566 688 17262 : tbm_begin_iterate(TIDBitmap *tbm)
689 : {
690 : TBMIterator *iterator;
691 :
2223 rhaas 692 17262 : Assert(tbm->iterating != TBM_ITERATING_SHARED);
693 :
694 : /*
695 : * Create the TBMIterator struct, with enough trailing space to serve the
696 : * needs of the TBMIterateResult sub-struct.
697 : */
5202 tgl 698 17262 : iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
699 : MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
700 17262 : iterator->tbm = tbm;
701 :
702 : /*
703 : * Initialize iteration pointers.
704 : */
705 17262 : iterator->spageptr = 0;
706 17262 : iterator->schunkptr = 0;
707 17262 : iterator->schunkbit = 0;
708 :
709 : /*
710 : * If we have a hashtable, create and fill the sorted page lists, unless
711 : * we already did that for a previous iterator. Note that the lists are
712 : * attached to the bitmap not the iterator, so they can be used by more
713 : * than one iterator.
714 : */
2223 rhaas 715 17262 : if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
716 : {
717 : pagetable_iterator i;
718 : PagetableEntry *page;
719 : int npages;
720 : int nchunks;
721 :
5202 tgl 722 4187 : if (!tbm->spages && tbm->npages > 0)
723 2900 : tbm->spages = (PagetableEntry **)
724 2900 : MemoryContextAlloc(tbm->mcxt,
725 2900 : tbm->npages * sizeof(PagetableEntry *));
726 4187 : if (!tbm->schunks && tbm->nchunks > 0)
727 1290 : tbm->schunks = (PagetableEntry **)
728 1290 : MemoryContextAlloc(tbm->mcxt,
729 1290 : tbm->nchunks * sizeof(PagetableEntry *));
730 :
731 4187 : npages = nchunks = 0;
2368 andres 732 4187 : pagetable_start_iterate(tbm->pagetable, &i);
733 184052 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
734 : {
5202 tgl 735 179865 : if (page->ischunk)
736 1311 : tbm->schunks[nchunks++] = page;
737 : else
738 178554 : tbm->spages[npages++] = page;
739 : }
740 4187 : Assert(npages == tbm->npages);
741 4187 : Assert(nchunks == tbm->nchunks);
742 4187 : if (npages > 1)
743 2898 : qsort(tbm->spages, npages, sizeof(PagetableEntry *),
744 : tbm_comparator);
745 4187 : if (nchunks > 1)
746 6 : qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
747 : tbm_comparator);
748 : }
749 :
2223 rhaas 750 17262 : tbm->iterating = TBM_ITERATING_PRIVATE;
751 :
5202 tgl 752 17262 : return iterator;
753 : }
754 :
755 : /*
756 : * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
757 : *
758 : * The necessary shared state will be allocated from the DSA passed to
759 : * tbm_create, so that multiple processes can attach to it and iterate jointly.
760 : *
761 : * This will convert the pagetable hash into page and chunk array of the index
762 : * into pagetable array.
763 : */
764 : dsa_pointer
2223 rhaas 765 72 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
766 : {
767 : dsa_pointer dp;
768 : TBMSharedIteratorState *istate;
2215 769 72 : PTEntryArray *ptbase = NULL;
2223 tgl 770 72 : PTIterationArray *ptpages = NULL;
771 72 : PTIterationArray *ptchunks = NULL;
772 :
rhaas 773 72 : Assert(tbm->dsa != NULL);
774 72 : Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
775 :
776 : /*
777 : * Allocate TBMSharedIteratorState from DSA to hold the shared members and
778 : * lock, this will also be used by multiple worker for shared iterate.
779 : */
2215 780 72 : dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
2223 781 72 : istate = dsa_get_address(tbm->dsa, dp);
782 :
783 : /*
784 : * If we're not already iterating, create and fill the sorted page lists.
785 : * (If we are, the sorted page lists are already stored in the TIDBitmap,
786 : * and we can just reuse them.)
787 : */
788 72 : if (tbm->iterating == TBM_NOT_ITERATING)
789 : {
790 : pagetable_iterator i;
791 : PagetableEntry *page;
792 : int idx;
793 : int npages;
794 : int nchunks;
795 :
796 : /*
797 : * Allocate the page and chunk array memory from the DSA to share
798 : * across multiple processes.
799 : */
800 36 : if (tbm->npages)
801 : {
802 36 : tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
803 : tbm->npages * sizeof(int));
804 36 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
805 36 : pg_atomic_init_u32(&ptpages->refcount, 0);
806 : }
807 36 : if (tbm->nchunks)
808 : {
809 3 : tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
810 : tbm->nchunks * sizeof(int));
811 3 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
812 3 : pg_atomic_init_u32(&ptchunks->refcount, 0);
813 : }
814 :
815 : /*
816 : * If TBM status is TBM_HASH then iterate over the pagetable and
817 : * convert it to page and chunk arrays. But if it's in the
818 : * TBM_ONE_PAGE mode then directly allocate the space for one entry
819 : * from the DSA.
820 : */
821 36 : npages = nchunks = 0;
822 36 : if (tbm->status == TBM_HASH)
823 : {
824 36 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
825 :
826 36 : pagetable_start_iterate(tbm->pagetable, &i);
827 13551 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
828 : {
829 13515 : idx = page - ptbase->ptentry;
830 13515 : if (page->ischunk)
831 12 : ptchunks->index[nchunks++] = idx;
832 : else
833 13503 : ptpages->index[npages++] = idx;
834 : }
835 :
836 36 : Assert(npages == tbm->npages);
837 36 : Assert(nchunks == tbm->nchunks);
838 : }
2215 rhaas 839 UBC 0 : else if (tbm->status == TBM_ONE_PAGE)
840 : {
841 : /*
842 : * In one page mode allocate the space for one pagetable entry,
843 : * initialize it, and directly store its index (i.e. 0) in the
844 : * page array.
845 : */
2223 846 0 : tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
847 : sizeof(PagetableEntry));
848 0 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
2189 849 0 : memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
2223 850 0 : ptpages->index[0] = 0;
851 : }
852 :
2215 rhaas 853 CBC 36 : if (ptbase != NULL)
854 36 : pg_atomic_init_u32(&ptbase->refcount, 0);
2223 855 36 : if (npages > 1)
61 peter 856 GNC 36 : qsort_arg(ptpages->index, npages, sizeof(int),
857 36 : tbm_shared_comparator, ptbase->ptentry);
2223 rhaas 858 CBC 36 : if (nchunks > 1)
61 peter 859 GNC 3 : qsort_arg(ptchunks->index, nchunks, sizeof(int),
860 3 : tbm_shared_comparator, ptbase->ptentry);
861 : }
862 :
863 : /*
864 : * Store the TBM members in the shared state so that we can share them
865 : * across multiple processes.
866 : */
2223 rhaas 867 CBC 72 : istate->nentries = tbm->nentries;
868 72 : istate->maxentries = tbm->maxentries;
869 72 : istate->npages = tbm->npages;
870 72 : istate->nchunks = tbm->nchunks;
871 72 : istate->pagetable = tbm->dsapagetable;
872 72 : istate->spages = tbm->ptpages;
873 72 : istate->schunks = tbm->ptchunks;
874 :
875 72 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876 72 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
877 72 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
878 :
879 : /*
880 : * For every shared iterator, referring to pagetable and iterator array,
881 : * increase the refcount by 1 so that while freeing the shared iterator we
882 : * don't free pagetable and iterator array until its refcount becomes 0.
883 : */
2215 884 72 : if (ptbase != NULL)
885 72 : pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
886 72 : if (ptpages != NULL)
2223 887 72 : pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
2215 888 72 : if (ptchunks != NULL)
2223 889 6 : pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
890 :
891 : /* Initialize the iterator lock */
1059 tgl 892 72 : LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);
893 :
894 : /* Initialize the shared iterator state */
2223 rhaas 895 72 : istate->schunkbit = 0;
896 72 : istate->schunkptr = 0;
897 72 : istate->spageptr = 0;
898 :
899 72 : tbm->iterating = TBM_ITERATING_SHARED;
900 :
901 72 : return dp;
902 : }
903 :
904 : /*
905 : * tbm_extract_page_tuple - extract the tuple offsets from a page
906 : *
907 : * The extracted offsets are stored into TBMIterateResult.
908 : */
909 : static inline int
910 387415 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
911 : {
912 : int wordnum;
913 387415 : int ntuples = 0;
914 :
915 2324490 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
916 : {
917 1937075 : bitmapword w = page->words[wordnum];
918 :
919 1937075 : if (w != 0)
920 : {
921 1053047 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
922 :
923 46497881 : while (w != 0)
924 : {
925 45444834 : if (w & 1)
926 7725402 : output->offsets[ntuples++] = (OffsetNumber) off;
927 45444834 : off++;
928 45444834 : w >>= 1;
929 : }
930 : }
931 : }
932 :
933 387415 : return ntuples;
934 : }
935 :
936 : /*
937 : * tbm_advance_schunkbit - Advance the schunkbit
938 : */
939 : static inline void
940 153408 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
941 : {
942 153408 : int schunkbit = *schunkbitp;
943 :
944 686124 : while (schunkbit < PAGES_PER_CHUNK)
945 : {
946 683478 : int wordnum = WORDNUM(schunkbit);
947 683478 : int bitnum = BITNUM(schunkbit);
948 :
949 683478 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
950 150762 : break;
951 532716 : schunkbit++;
952 : }
953 :
954 153408 : *schunkbitp = schunkbit;
955 153408 : }
956 :
/*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition.  If result->recheck is true, only the indicated tuples need
 * be examined, but the condition must be rechecked anyway.  (For ease of
 * testing, recheck is always set true when ntuples < 0.)
 */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	/* Only valid for a backend-private (non-shared) iteration */
	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* found a set bit in the current chunk; remember position */
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.  (schunks[] and spages[] were each sorted by blockno
	 * before iteration began, so only the heads need comparing.)
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		/* blockno of the page represented by the current chunk bit */
		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;	/* negative ntuples signals "lossy" */
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
1042 :
/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes.  We need to acquire the iterator LWLock,
 * before accessing the shared members.
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/*
	 * Translate the backend-local pointers (set up by
	 * tbm_attach_shared_iterate) into the underlying arrays; any of these
	 * may be absent if the bitmap has no exact pages or no lossy chunks.
	 */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			/* output is backend-private, safe to return after unlocking */
			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1136 :
/*
 * tbm_end_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday.  (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 *
 * Note: does not free the underlying TIDBitmap; the iterator does not
 * own it.
 */
void
tbm_end_iterate(TBMIterator *iterator)
{
	pfree(iterator);
}
1149 :
/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.  (The shared pagetable/index arrays are
 * refcounted and released elsewhere.)
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	pfree(iterator);
}
1161 :
1162 : /*
1163 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1164 : *
1165 : * Returns NULL if there is no non-lossy entry for the pageno.
1166 : */
1167 : static const PagetableEntry *
6566 tgl 1168 2536 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1169 : {
1170 : const PagetableEntry *page;
1171 :
6536 1172 2536 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
6536 tgl 1173 UBC 0 : return NULL;
1174 :
6536 tgl 1175 CBC 2536 : if (tbm->status == TBM_ONE_PAGE)
1176 : {
6536 tgl 1177 UBC 0 : page = &tbm->entry1;
1178 0 : if (page->blockno != pageno)
1179 0 : return NULL;
1180 0 : Assert(!page->ischunk);
1181 0 : return page;
1182 : }
1183 :
2368 andres 1184 CBC 2536 : page = pagetable_lookup(tbm->pagetable, pageno);
6566 tgl 1185 2536 : if (page == NULL)
1186 264 : return NULL;
1187 2272 : if (page->ischunk)
6566 tgl 1188 UBC 0 : return NULL; /* don't want a lossy chunk header */
6566 tgl 1189 CBC 2272 : return page;
1190 : }
1191 :
/*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static PagetableEntry *
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;

	if (tbm->status == TBM_EMPTY)
	{
		/* Use the fixed slot; bitmap transitions EMPTY -> ONE_PAGE */
		page = &tbm->entry1;
		found = false;
		tbm->status = TBM_ONE_PAGE;
	}
	else
	{
		if (tbm->status == TBM_ONE_PAGE)
		{
			page = &tbm->entry1;
			if (page->blockno == pageno)
				return page;
			/* Time to switch from one page to a hashtable */
			tbm_create_pagetable(tbm);
		}

		/* Look up or create an entry */
		page = pagetable_insert(tbm->pagetable, pageno, &found);
	}

	/* Initialize it if not present before */
	if (!found)
	{
		/* preserve simplehash's per-entry status byte across the MemSet */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = pageno;
		/* must count it too */
		tbm->nentries++;
		tbm->npages++;
	}

	return page;
}
1243 :
1244 : /*
1245 : * tbm_page_is_lossy - is the page marked as lossily stored?
1246 : */
1247 : static bool
1248 4619382 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1249 : {
1250 : PagetableEntry *page;
1251 : BlockNumber chunk_pageno;
1252 : int bitno;
1253 :
1254 : /* we can skip the lookup if there are no lossy chunks */
1255 4619382 : if (tbm->nchunks == 0)
1256 4559067 : return false;
6536 1257 60315 : Assert(tbm->status == TBM_HASH);
1258 :
6566 1259 60315 : bitno = pageno % PAGES_PER_CHUNK;
1260 60315 : chunk_pageno = pageno - bitno;
1261 :
2368 andres 1262 60315 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1263 :
6566 tgl 1264 60315 : if (page != NULL && page->ischunk)
1265 : {
6385 bruce 1266 6216 : int wordnum = WORDNUM(bitno);
1267 6216 : int bitnum = BITNUM(bitno);
1268 :
6566 tgl 1269 6216 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1270 2856 : return true;
1271 : }
1272 57459 : return false;
1273 : }
1274 :
/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Locate the chunk-header page and this page's bit within the chunk */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* preserve simplehash's per-entry status byte across the MemSet */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts: one exact page became a lossy chunk */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1349 :
/*
 * tbm_lossify - lose some information to get back under the memory limit
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* Resume scanning where the previous lossify round stopped */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room.  Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1418 :
1419 : /*
1420 : * qsort comparator to handle PagetableEntry pointers.
1421 : */
1422 : static int
1423 1418555 : tbm_comparator(const void *left, const void *right)
1424 : {
3955 bruce 1425 1418555 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1426 1418555 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1427 :
6566 tgl 1428 1418555 : if (l < r)
1429 657382 : return -1;
1430 761173 : else if (l > r)
1431 761173 : return 1;
6566 tgl 1432 UBC 0 : return 0;
1433 : }
1434 :
1435 : /*
1436 : * As above, but this will get index into PagetableEntry array. Therefore,
1437 : * it needs to get actual PagetableEntry using the index before comparing the
1438 : * blockno.
1439 : */
1440 : static int
2223 rhaas 1441 CBC 119145 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1442 : {
1443 119145 : PagetableEntry *base = (PagetableEntry *) arg;
1444 119145 : PagetableEntry *lpage = &base[*(int *) left];
1445 119145 : PagetableEntry *rpage = &base[*(int *) right];
1446 :
1447 119145 : if (lpage->blockno < rpage->blockno)
1448 54483 : return -1;
1449 64662 : else if (lpage->blockno > rpage->blockno)
1450 64662 : return 1;
2223 rhaas 1451 UBC 0 : return 0;
1452 : }
1453 :
/*
 * tbm_attach_shared_iterate
 *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them into
 * our private iterator.
 */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	/* Only translate the page/chunk index arrays that actually exist */
	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}
1489 :
/*
 * pagetable_allocate
 *
 * Callback function for allocating the memory for hashtable elements.
 * Allocate memory for hashtable elements, using DSA if available.
 */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
	PTEntryArray *ptbase;

	/* Backend-local bitmap: ordinary (zeroed, possibly huge) palloc */
	if (tbm->dsa == NULL)
		return MemoryContextAllocExtended(pagetable->ctx, size,
										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

	/*
	 * Save the dsapagetable reference in dsapagetableold before allocating
	 * new memory so that pagetable_free can free the old entry.
	 */
	tbm->dsapagetableold = tbm->dsapagetable;
	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
											  sizeof(PTEntryArray) + size,
											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

	/* hand the caller the entry array inside the PTEntryArray header */
	return ptbase->ptentry;
}
1518 :
1519 : /*
1520 : * pagetable_free
1521 : *
1522 : * Callback function for freeing hash table elements.
1523 : */
1524 : static inline void
1525 4651 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1526 : {
1527 4651 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1528 :
1529 : /* pfree the input pointer if DSA is not available */
1530 4651 : if (tbm->dsa == NULL)
1531 4573 : pfree(pointer);
1532 78 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1533 : {
1534 42 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1535 42 : tbm->dsapagetableold = InvalidDsaPointer;
1536 : }
1537 4651 : }
1538 :
1539 : /*
1540 : * tbm_calculate_entries
1541 : *
1542 : * Estimate number of hashtable entries we can have within maxbytes.
1543 : */
1544 : long
1976 1545 242553 : tbm_calculate_entries(double maxbytes)
1546 : {
1547 : long nbuckets;
1548 :
1549 : /*
1550 : * Estimate number of hashtable entries we can have within maxbytes. This
1551 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552 : * for our purpose. Also count an extra Pointer per entry for the arrays
1553 : * created during iteration readout.
1554 : */
1555 242553 : nbuckets = maxbytes /
1556 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557 242553 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1558 242553 : nbuckets = Max(nbuckets, 16); /* sanity limit */
1559 :
1560 242553 : return nbuckets;
1561 : }
|