/*-------------------------------------------------------------------------
 *
 * dshash.c
 *    Concurrent hash tables backed by dynamic shared memory areas.
 *
 * This is an open hashing hash table, with a linked list at each table
 * entry. It supports dynamic resizing, as required to prevent the linked
 * lists from growing too long on average. Currently, only growing is
 * supported: the hash table never becomes smaller.
 *
 * To deal with concurrency, it has a fixed-size set of partitions, each of
 * which is independently locked. Each bucket maps to a partition; so insert,
 * find and iterate operations normally only acquire one lock. Therefore,
 * good concurrency is achieved whenever such operations don't collide at the
 * lock partition level. However, when a resize operation begins, all
 * partition locks must be acquired simultaneously for a brief period. This
 * is only expected to happen a small number of times until a stable size is
 * found, since growth is geometric.
 *
 * Sequential scans are supported (see dshash_seq_init() and friends), but
 * incremental resizing is not; for now the implementation is otherwise
 * minimalist.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/lib/dshash.c
 *
 *-------------------------------------------------------------------------
 */
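
/*
 * Illustrative usage sketch (an editor's addition, not part of the dshash
 * API; the lower-case names below -- my_key, my_entry, my_compare, my_hash,
 * my_tranche_id, ht, key -- are hypothetical caller-side code, and the
 * dshash_parameters fields are assumed to be key_size, entry_size,
 * compare_function, hash_function and tranche_id as declared in
 * lib/dshash.h):
 *
 *    static const dshash_parameters params = {
 *        sizeof(my_key), sizeof(my_entry),
 *        my_compare, my_hash, my_tranche_id
 *    };
 *    dshash_table *ht = dshash_create(area, &params, NULL);
 *    bool found;
 *    my_entry *entry = dshash_find_or_insert(ht, &key, &found);
 *
 *    if (!found)
 *        ... initialize the new *entry ...
 *    dshash_release_lock(ht, entry);
 */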

#include "postgres.h"

#include "common/hashfn.h"
#include "lib/dshash.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"
#include "utils/memutils.h"

/*
 * An item in the hash table. This wraps the user's entry object in an
 * envelope that holds the hashed key and a pointer to the next item in the
 * same bucket.
 */
struct dshash_table_item
{
    /* The next item in the same bucket. */
    dsa_pointer next;
    /* The hashed key, to avoid having to recompute it. */
    dshash_hash hash;
    /* The user's entry object follows here. See ENTRY_FROM_ITEM(item). */
};

/*
 * The number of partitions for locking purposes. This is set to match
 * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
 * the buffer pool must be good enough for any other purpose. This could
 * become a runtime parameter in future.
 */
#define DSHASH_NUM_PARTITIONS_LOG2 7
#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)

/* A magic value used to identify our hash tables. */
#define DSHASH_MAGIC 0x75ff6a20

/*
 * Tracking information for each lock partition. Initially, each partition
 * corresponds to one bucket, but each time the hash table grows, the buckets
 * covered by each partition split so the number of buckets covered doubles.
 *
 * We might want to add padding here so that each partition is on a different
 * cache line, but doing so would bloat this structure considerably.
 */
typedef struct dshash_partition
{
    LWLock      lock;           /* Protects all buckets in this partition. */
    size_t      count;          /* # of items in this partition's buckets */
} dshash_partition;

/*
 * The head object for a hash table. This will be stored in dynamic shared
 * memory.
 */
typedef struct dshash_table_control
{
    dshash_table_handle handle;
    uint32      magic;
    dshash_partition partitions[DSHASH_NUM_PARTITIONS];
    int         lwlock_tranche_id;

    /*
     * The following members are written to only when ALL partition locks are
     * held. They can be read when any one partition lock is held.
     */

    /* Number of buckets expressed as power of 2 (8 = 256 buckets). */
    size_t      size_log2;      /* log2(number of buckets) */
    dsa_pointer buckets;        /* current bucket array */
} dshash_table_control;

/*
 * Per-backend state for a dynamic hash table.
 */
struct dshash_table
{
    dsa_area   *area;           /* Backing dynamic shared memory area. */
    dshash_parameters params;   /* Parameters. */
    void       *arg;            /* User-supplied data pointer. */
    dshash_table_control *control;  /* Control object in DSM. */
    dsa_pointer *buckets;       /* Current bucket pointers in DSM. */
    size_t      size_log2;      /* log2(number of buckets) */
};

/* Given a pointer to an item, find the entry (user data) it holds. */
#define ENTRY_FROM_ITEM(item) \
    ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))

/* Given a pointer to an entry, find the item that holds it. */
#define ITEM_FROM_ENTRY(entry) \
    ((dshash_table_item *)((char *)(entry) - \
                           MAXALIGN(sizeof(dshash_table_item))))
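
/*
 * Memory layout implied by the macros above (an illustrative sketch): each
 * allocation made by insert_into_bucket() holds the item envelope followed
 * by the caller's entry, with MAXALIGN padding in between so that the entry
 * is suitably aligned.
 *
 *    +-------------------+--------------+------------------------+
 *    | dshash_table_item | MAXALIGN pad | entry_size user bytes  |
 *    +-------------------+--------------+------------------------+
 *    ^ item                             ^ ENTRY_FROM_ITEM(item)
 */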

/* How many resize operations (bucket splits) have there been? */
#define NUM_SPLITS(size_log2) \
    (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)

/* How many buckets are there in a given size? */
#define NUM_BUCKETS(size_log2) \
    (((size_t) 1) << (size_log2))

/* How many buckets are there in each partition at a given size? */
#define BUCKETS_PER_PARTITION(size_log2) \
    (((size_t) 1) << NUM_SPLITS(size_log2))

/* Max entries before we need to grow. Half + quarter = 75% load factor. */
#define MAX_COUNT_PER_PARTITION(hash_table) \
    (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
     BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
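
/*
 * Worked example for MAX_COUNT_PER_PARTITION (an illustrative note; integer
 * division applies): at the initial size_log2 of 7 each partition covers one
 * bucket, so the threshold is 1/2 + 1/4 = 0 and the next insertion into a
 * partition that already holds an item triggers a grow; at size_log2 = 10
 * each partition covers 8 buckets and the threshold is 8/2 + 8/4 = 6 items,
 * i.e. a 75% load factor.
 */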

/* Choose partition based on the highest order bits of the hash. */
#define PARTITION_FOR_HASH(hash) \
    (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))

/*
 * Find the bucket index for a given hash and table size. Each time the table
 * doubles in size, the bucket index for a given hash value doubles and
 * possibly adds one, depending on the newly revealed bit of the hash, so that
 * every bucket is split.
 */
#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \
    (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))

/* The index of the first bucket in a given partition. */
#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
    ((partition) << NUM_SPLITS(size_log2))

/* Choose partition based on bucket index. */
#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
    ((bucket_idx) >> NUM_SPLITS(size_log2))
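
/*
 * Worked example for the macros above (an illustrative note, assuming the
 * 32-bit dshash_hash from lib/dshash.h): with hash = 0xDEADBEEF and
 * size_log2 = 8, BUCKET_INDEX_FOR_HASH_AND_SIZE yields 0xDEADBEEF >> 24 =
 * 0xDE (bucket 222), PARTITION_FOR_HASH yields 0xDEADBEEF >> 25 = 0x6F
 * (partition 111), and PARTITION_FOR_BUCKET_INDEX(222, 8) = 222 >> 1 = 111,
 * consistently.
 */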

/* The head of the active bucket for a given hash value (lvalue). */
#define BUCKET_FOR_HASH(hash_table, hash) \
    (hash_table->buckets[ \
        BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \
                                       hash_table->size_log2)])

static void delete_item(dshash_table *hash_table,
                        dshash_table_item *item);
static void resize(dshash_table *hash_table, size_t new_size_log2);
static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
                                                const void *key,
                                                dsa_pointer item_pointer);
static void insert_item_into_bucket(dshash_table *hash_table,
                                    dsa_pointer item_pointer,
                                    dshash_table_item *item,
                                    dsa_pointer *bucket);
static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
                                             const void *key,
                                             dsa_pointer *bucket);
static bool delete_key_from_bucket(dshash_table *hash_table,
                                   const void *key,
                                   dsa_pointer *bucket_head);
static bool delete_item_from_bucket(dshash_table *hash_table,
                                    dshash_table_item *item,
                                    dsa_pointer *bucket_head);
static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
static inline bool equal_keys(dshash_table *hash_table,
                              const void *a, const void *b);

#define PARTITION_LOCK(hash_table, i) \
    (&(hash_table)->control->partitions[(i)].lock)

#define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
    Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
                              DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))

/*
 * Create a new hash table backed by the given dynamic shared area, with the
 * given parameters. The returned object is allocated in backend-local memory
 * using the current MemoryContext. 'arg' will be passed through to the
 * compare and hash functions.
 */
dshash_table *
dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
{
    dshash_table *hash_table;
    dsa_pointer control;

    /* Allocate the backend-local object representing the hash table. */
    hash_table = palloc(sizeof(dshash_table));

    /* Allocate the control object in shared memory. */
    control = dsa_allocate(area, sizeof(dshash_table_control));

    /* Set up the local and shared hash table structs. */
    hash_table->area = area;
    hash_table->params = *params;
    hash_table->arg = arg;
    hash_table->control = dsa_get_address(area, control);
    hash_table->control->handle = control;
    hash_table->control->magic = DSHASH_MAGIC;
    hash_table->control->lwlock_tranche_id = params->tranche_id;

    /* Set up the array of lock partitions. */
    {
        dshash_partition *partitions = hash_table->control->partitions;
        int         tranche_id = hash_table->control->lwlock_tranche_id;
        int         i;

        for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
        {
            LWLockInitialize(&partitions[i].lock, tranche_id);
            partitions[i].count = 0;
        }
    }

    /*
     * Set up the initial array of buckets. Our initial size is the same as
     * the number of partitions.
     */
    hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
    hash_table->control->buckets =
        dsa_allocate_extended(area,
                              sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
                              DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
    if (!DsaPointerIsValid(hash_table->control->buckets))
    {
        dsa_free(area, control);
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory"),
                 errdetail("Failed on DSA request of size %zu.",
                           sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
    }
    hash_table->buckets = dsa_get_address(area,
                                          hash_table->control->buckets);
    hash_table->size_log2 = hash_table->control->size_log2;

    return hash_table;
}

/*
 * Attach to an existing hash table using a handle. The returned object is
 * allocated in backend-local memory using the current MemoryContext. 'arg'
 * will be passed through to the compare and hash functions.
 */
dshash_table *
dshash_attach(dsa_area *area, const dshash_parameters *params,
              dshash_table_handle handle, void *arg)
{
    dshash_table *hash_table;
    dsa_pointer control;

    /* Allocate the backend-local object representing the hash table. */
    hash_table = palloc(sizeof(dshash_table));

    /* Find the control object in shared memory. */
    control = handle;

    /* Set up the local hash table struct. */
    hash_table->area = area;
    hash_table->params = *params;
    hash_table->arg = arg;
    hash_table->control = dsa_get_address(area, control);
    Assert(hash_table->control->magic == DSHASH_MAGIC);

    /*
     * These will later be set to the correct values by
     * ensure_valid_bucket_pointers(), at which time we'll be holding a
     * partition lock for interlocking against concurrent resizing.
     */
    hash_table->buckets = NULL;
    hash_table->size_log2 = 0;

    return hash_table;
}

/*
 * Detach from a hash table. This frees backend-local resources associated
 * with the hash table, but the hash table will continue to exist until it is
 * either explicitly destroyed (by a backend that is still attached to it), or
 * the area that backs it is returned to the operating system.
 */
void
dshash_detach(dshash_table *hash_table)
{
    ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

    /* The hash table may have been destroyed. Just free local memory. */
    pfree(hash_table);
}

/*
 * Destroy a hash table, returning all memory to the area. The caller must be
 * certain that no other backend will attempt to access the hash table before
 * calling this function. Other backends must explicitly call dshash_detach
 * to free up backend-local memory associated with the hash table. The
 * backend that calls dshash_destroy must not call dshash_detach.
 */
void
dshash_destroy(dshash_table *hash_table)
{
    size_t      size;
    size_t      i;

    Assert(hash_table->control->magic == DSHASH_MAGIC);
    ensure_valid_bucket_pointers(hash_table);

    /* Free all the entries. */
    size = NUM_BUCKETS(hash_table->size_log2);
    for (i = 0; i < size; ++i)
    {
        dsa_pointer item_pointer = hash_table->buckets[i];

        while (DsaPointerIsValid(item_pointer))
        {
            dshash_table_item *item;
            dsa_pointer next_item_pointer;

            item = dsa_get_address(hash_table->area, item_pointer);
            next_item_pointer = item->next;
            dsa_free(hash_table->area, item_pointer);
            item_pointer = next_item_pointer;
        }
    }

    /*
     * Vandalize the control block to help catch programming errors where
     * other backends access the memory formerly occupied by this hash table.
     */
    hash_table->control->magic = 0;

    /* Free the active table and control object. */
    dsa_free(hash_table->area, hash_table->control->buckets);
    dsa_free(hash_table->area, hash_table->control->handle);

    pfree(hash_table);
}

/*
 * Get a handle that can be used by other processes to attach to this hash
 * table.
 */
dshash_table_handle
dshash_get_hash_table_handle(dshash_table *hash_table)
{
    Assert(hash_table->control->magic == DSHASH_MAGIC);

    return hash_table->control->handle;
}

/*
 * Look up an entry, given a key. Returns a pointer to an entry if one can be
 * found with the given key. Returns NULL if the key is not found. If a
 * non-NULL value is returned, the entry is locked and must be released by
 * calling dshash_release_lock. If an error is raised before
 * dshash_release_lock is called, the lock will be released automatically, but
 * the caller must take care to ensure that the entry is not left corrupted.
 * The lock mode is either shared or exclusive depending on 'exclusive'.
 *
 * The caller must not hold a lock already.
 *
 * Note that the lock held is in fact an LWLock, so interrupts will be held on
 * return from this function, and not resumed until dshash_release_lock is
 * called. It is a very good idea for the caller to release the lock quickly.
 */
void *
dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
{
    dshash_hash hash;
    size_t      partition;
    dshash_table_item *item;

    hash = hash_key(hash_table, key);
    partition = PARTITION_FOR_HASH(hash);

    Assert(hash_table->control->magic == DSHASH_MAGIC);
    ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

    LWLockAcquire(PARTITION_LOCK(hash_table, partition),
                  exclusive ? LW_EXCLUSIVE : LW_SHARED);
    ensure_valid_bucket_pointers(hash_table);

    /* Search the active bucket. */
    item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));

    if (!item)
    {
        /* Not found. */
        LWLockRelease(PARTITION_LOCK(hash_table, partition));
        return NULL;
    }
    else
    {
        /* The caller will free the lock by calling dshash_release_lock. */
        return ENTRY_FROM_ITEM(item);
    }
}
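
/*
 * Illustrative call pattern for dshash_find() (a sketch only; "ht", "key"
 * and the entry type "my_entry" are hypothetical caller-side names):
 *
 *    my_entry *entry = dshash_find(ht, &key, false);
 *
 *    if (entry != NULL)
 *    {
 *        ... read from *entry while the shared lock is held ...
 *        dshash_release_lock(ht, entry);
 *    }
 */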

/*
 * Returns a pointer to an exclusively locked item which must be released with
 * dshash_release_lock. If the key is found in the hash table, 'found' is set
 * to true and a pointer to the existing entry is returned. If the key is not
 * found, 'found' is set to false, and a pointer to a newly created entry is
 * returned.
 *
 * Notes above dshash_find() regarding locking and error handling equally
 * apply here.
 */
void *
dshash_find_or_insert(dshash_table *hash_table,
                      const void *key,
                      bool *found)
{
    dshash_hash hash;
    size_t      partition_index;
    dshash_partition *partition;
    dshash_table_item *item;

    hash = hash_key(hash_table, key);
    partition_index = PARTITION_FOR_HASH(hash);
    partition = &hash_table->control->partitions[partition_index];

    Assert(hash_table->control->magic == DSHASH_MAGIC);
    ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

restart:
    LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
                  LW_EXCLUSIVE);
    ensure_valid_bucket_pointers(hash_table);

    /* Search the active bucket. */
    item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));

    if (item)
        *found = true;
    else
    {
        *found = false;

        /* Check if we are getting too full. */
        if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
        {
            /*
             * The load factor (= keys / buckets) for all buckets protected by
             * this partition is > 0.75. Presumably the same applies
             * generally across the whole hash table (though we don't attempt
             * to track that directly to avoid contention on some kind of
             * central counter; we just assume that this partition is
             * representative). This is a good time to resize.
             *
             * Give up our existing lock first, because resizing needs to
             * reacquire all the locks in the right order to avoid deadlocks.
             */
            LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
            resize(hash_table, hash_table->size_log2 + 1);

            goto restart;
        }

        /* Finally we can try to insert the new item. */
        item = insert_into_bucket(hash_table, key,
                                  &BUCKET_FOR_HASH(hash_table, hash));
        item->hash = hash;
        /* Adjust per-lock-partition counter for load factor knowledge. */
        ++partition->count;
    }

    /* The caller must release the lock with dshash_release_lock. */
    return ENTRY_FROM_ITEM(item);
}

/*
 * Remove an entry by key. Returns true if the key was found and the
 * corresponding entry was removed.
 *
 * To delete an entry that you already have a pointer to, see
 * dshash_delete_entry.
 */
bool
dshash_delete_key(dshash_table *hash_table, const void *key)
{
    dshash_hash hash;
    size_t      partition;
    bool        found;

    Assert(hash_table->control->magic == DSHASH_MAGIC);
    ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

    hash = hash_key(hash_table, key);
    partition = PARTITION_FOR_HASH(hash);

    LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
    ensure_valid_bucket_pointers(hash_table);

    if (delete_key_from_bucket(hash_table, key,
                               &BUCKET_FOR_HASH(hash_table, hash)))
    {
        Assert(hash_table->control->partitions[partition].count > 0);
        found = true;
        --hash_table->control->partitions[partition].count;
    }
    else
        found = false;

    LWLockRelease(PARTITION_LOCK(hash_table, partition));

    return found;
}

/*
 * Remove an entry. The entry must already be exclusively locked, and must
 * have been obtained by dshash_find or dshash_find_or_insert. Note that this
 * function releases the lock just like dshash_release_lock.
 *
 * To delete an entry by key, see dshash_delete_key.
 */
void
dshash_delete_entry(dshash_table *hash_table, void *entry)
{
    dshash_table_item *item = ITEM_FROM_ENTRY(entry);
    size_t      partition = PARTITION_FOR_HASH(item->hash);

    Assert(hash_table->control->magic == DSHASH_MAGIC);
    Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
                                LW_EXCLUSIVE));

    delete_item(hash_table, item);
    LWLockRelease(PARTITION_LOCK(hash_table, partition));
}

/*
 * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
 */
void
dshash_release_lock(dshash_table *hash_table, void *entry)
{
    dshash_table_item *item = ITEM_FROM_ENTRY(entry);
    size_t      partition_index = PARTITION_FOR_HASH(item->hash);

    Assert(hash_table->control->magic == DSHASH_MAGIC);

    LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
}

/*
 * A compare function that forwards to memcmp.
 */
int
dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
{
    return memcmp(a, b, size);
}

/*
 * A hash function that forwards to tag_hash.
 */
dshash_hash
dshash_memhash(const void *v, size_t size, void *arg)
{
    return tag_hash(v, size);
}

/*
 * Initiate a sequential scan of the hash table. dshash_seq_next() returns
 * the elements one by one, and NULL once all elements have been returned.
 *
 * dshash_seq_term() needs to be called when the scan is finished. The caller
 * may delete returned elements in the middle of a scan by using
 * dshash_delete_current() if 'exclusive' is true.
 */
void
dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
                bool exclusive)
{
    status->hash_table = hash_table;
    status->curbucket = 0;
    status->nbuckets = 0;
    status->curitem = NULL;
    status->pnextitem = InvalidDsaPointer;
    status->curpartition = -1;
    status->exclusive = exclusive;
}
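
/*
 * Illustrative scan loop (a sketch only; "ht" and the entry type "my_entry"
 * are hypothetical caller-side names):
 *
 *    dshash_seq_status status;
 *    my_entry *entry;
 *
 *    dshash_seq_init(&status, ht, false);
 *    while ((entry = dshash_seq_next(&status)) != NULL)
 *        ... inspect *entry ...
 *    dshash_seq_term(&status);
 */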

/*
 * Returns the next element.
 *
 * Returned elements are locked and the caller must not explicitly release the
 * lock. It is released by future calls to dshash_seq_next() or
 * dshash_seq_term().
 */
void *
dshash_seq_next(dshash_seq_status *status)
{
    dsa_pointer next_item_pointer;

    /*
     * Not yet holding any partition locks. Need to determine the size of the
     * hash table; it could have been resized since we last looked. Since we
     * iterate in partition order, we can start by unconditionally locking
     * partition 0.
     *
     * Once we hold the lock, no resizing can happen until the scan ends, so
     * we don't need to repeatedly call ensure_valid_bucket_pointers().
     */
    if (status->curpartition == -1)
    {
        Assert(status->curbucket == 0);
        ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(status->hash_table);

        status->curpartition = 0;

        LWLockAcquire(PARTITION_LOCK(status->hash_table,
                                     status->curpartition),
                      status->exclusive ? LW_EXCLUSIVE : LW_SHARED);

        ensure_valid_bucket_pointers(status->hash_table);

        status->nbuckets =
            NUM_BUCKETS(status->hash_table->control->size_log2);
        next_item_pointer = status->hash_table->buckets[status->curbucket];
    }
    else
        next_item_pointer = status->pnextitem;

    Assert(LWLockHeldByMeInMode(PARTITION_LOCK(status->hash_table,
                                               status->curpartition),
                                status->exclusive ? LW_EXCLUSIVE : LW_SHARED));

    /* Move to the next bucket if we finished the current bucket */
    while (!DsaPointerIsValid(next_item_pointer))
    {
        int         next_partition;

        if (++status->curbucket >= status->nbuckets)
        {
            /* All buckets have been scanned; finish. */
            return NULL;
        }

        /* Check if we need to move to the next partition */
        next_partition =
            PARTITION_FOR_BUCKET_INDEX(status->curbucket,
                                       status->hash_table->size_log2);

        if (status->curpartition != next_partition)
        {
            /*
             * Move to the next partition. Lock the next partition and only
             * then release the current one, so that a concurrent resize
             * cannot run in the gap. Taking the locks in this order, the
             * same order used by resize(), also avoids deadlock.
             */
            LWLockAcquire(PARTITION_LOCK(status->hash_table,
                                         next_partition),
                          status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
            LWLockRelease(PARTITION_LOCK(status->hash_table,
                                         status->curpartition));
            status->curpartition = next_partition;
        }

        next_item_pointer = status->hash_table->buckets[status->curbucket];
    }

    status->curitem =
        dsa_get_address(status->hash_table->area, next_item_pointer);

    /*
     * The caller may delete the item. Store the next item in case of
     * deletion.
     */
    status->pnextitem = status->curitem->next;

    return ENTRY_FROM_ITEM(status->curitem);
}

/*
 * Terminates the seqscan and releases all locks.
 *
 * Needs to be called when a seqscan is finished, or when exiting one early.
 */
void
dshash_seq_term(dshash_seq_status *status)
{
    if (status->curpartition >= 0)
        LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
}

/*
 * Remove the current entry of the seq scan.
 */
void
dshash_delete_current(dshash_seq_status *status)
{
    dshash_table *hash_table = status->hash_table;
    dshash_table_item *item = status->curitem;
    size_t      partition PG_USED_FOR_ASSERTS_ONLY;

    partition = PARTITION_FOR_HASH(item->hash);

    Assert(status->exclusive);
    Assert(hash_table->control->magic == DSHASH_MAGIC);
    Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
                                LW_EXCLUSIVE));

    delete_item(hash_table, item);
}

/*
 * Print debugging information about the internal state of the hash table to
 * stderr. The caller must hold no partition locks.
 */
void
dshash_dump(dshash_table *hash_table)
{
    size_t      i;
    size_t      j;

    Assert(hash_table->control->magic == DSHASH_MAGIC);
    ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
    {
        Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
        LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
    }

    ensure_valid_bucket_pointers(hash_table);

    fprintf(stderr,
            "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
    {
        dshash_partition *partition = &hash_table->control->partitions[i];
        size_t      begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
        size_t      end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);

        fprintf(stderr, "  partition %zu\n", i);
        fprintf(stderr,
                "    active buckets (key count = %zu)\n", partition->count);

        for (j = begin; j < end; ++j)
        {
            size_t      count = 0;
            dsa_pointer bucket = hash_table->buckets[j];

            while (DsaPointerIsValid(bucket))
            {
                dshash_table_item *item;

                item = dsa_get_address(hash_table->area, bucket);

                bucket = item->next;
                ++count;
            }
            fprintf(stderr, "      bucket %zu (key count = %zu)\n", j, count);
        }
    }

    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
        LWLockRelease(PARTITION_LOCK(hash_table, i));
}

/*
 * Delete a locked item to which we have a pointer.
 */
static void
delete_item(dshash_table *hash_table, dshash_table_item *item)
{
    size_t      hash = item->hash;
    size_t      partition = PARTITION_FOR_HASH(hash);

    Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));

    if (delete_item_from_bucket(hash_table, item,
                                &BUCKET_FOR_HASH(hash_table, hash)))
    {
        Assert(hash_table->control->partitions[partition].count > 0);
        --hash_table->control->partitions[partition].count;
    }
    else
    {
        Assert(false);
    }
}

/*
 * Grow the hash table if necessary to the requested number of buckets. The
 * requested size must be double some previously observed size.
 *
 * Must be called without any partition lock held.
 */
static void
resize(dshash_table *hash_table, size_t new_size_log2)
{
    dsa_pointer old_buckets;
    dsa_pointer new_buckets_shared;
    dsa_pointer *new_buckets;
    size_t      size;
    size_t      new_size = ((size_t) 1) << new_size_log2;
    size_t      i;

    /*
     * Acquire the locks for all lock partitions. This is expensive, but we
     * shouldn't have to do it many times.
     */
    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
    {
        Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));

        LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
        if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
        {
            /*
             * Another backend has already increased the size; we can avoid
             * obtaining all the locks and return early.
             */
            LWLockRelease(PARTITION_LOCK(hash_table, 0));
            return;
        }
    }

    Assert(new_size_log2 == hash_table->control->size_log2 + 1);

    /* Allocate the space for the new table. */
    new_buckets_shared = dsa_allocate0(hash_table->area,
                                       sizeof(dsa_pointer) * new_size);
    new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);

    /*
     * We've allocated the new bucket array; all that remains to do now is to
     * reinsert all items, which amounts to adjusting all the pointers.
     */
    size = ((size_t) 1) << hash_table->control->size_log2;
    for (i = 0; i < size; ++i)
    {
        dsa_pointer item_pointer = hash_table->buckets[i];

        while (DsaPointerIsValid(item_pointer))
        {
            dshash_table_item *item;
            dsa_pointer next_item_pointer;

            item = dsa_get_address(hash_table->area, item_pointer);
            next_item_pointer = item->next;
            insert_item_into_bucket(hash_table, item_pointer, item,
                                    &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
                                                                                new_size_log2)]);
            item_pointer = next_item_pointer;
        }
    }

    /* Swap the hash table into place and free the old one. */
    old_buckets = hash_table->control->buckets;
    hash_table->control->buckets = new_buckets_shared;
    hash_table->control->size_log2 = new_size_log2;
    hash_table->buckets = new_buckets;
    dsa_free(hash_table->area, old_buckets);

    /* Release all the locks. */
    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
        LWLockRelease(PARTITION_LOCK(hash_table, i));
}

/*
 * Make sure that our backend-local bucket pointers are up to date. The
 * caller must have locked one lock partition, which prevents resize() from
 * running concurrently.
 */
static inline void
ensure_valid_bucket_pointers(dshash_table *hash_table)
{
    if (hash_table->size_log2 != hash_table->control->size_log2)
    {
        hash_table->buckets = dsa_get_address(hash_table->area,
                                              hash_table->control->buckets);
        hash_table->size_log2 = hash_table->control->size_log2;
    }
}

/*
 * Scan a locked bucket for a match, using the provided compare function.
 */
static inline dshash_table_item *
find_in_bucket(dshash_table *hash_table, const void *key,
               dsa_pointer item_pointer)
{
    while (DsaPointerIsValid(item_pointer))
    {
        dshash_table_item *item;

        item = dsa_get_address(hash_table->area, item_pointer);
        if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
            return item;
        item_pointer = item->next;
    }
    return NULL;
}

/*
 * Insert an already-allocated item into a bucket.
 */
static void
insert_item_into_bucket(dshash_table *hash_table,
                        dsa_pointer item_pointer,
                        dshash_table_item *item,
                        dsa_pointer *bucket)
{
    Assert(item == dsa_get_address(hash_table->area, item_pointer));

    item->next = *bucket;
    *bucket = item_pointer;
}

/*
 * Allocate space for an entry with the given key and insert it into the
 * provided bucket.
 */
static dshash_table_item *
insert_into_bucket(dshash_table *hash_table,
                   const void *key,
                   dsa_pointer *bucket)
{
    dsa_pointer item_pointer;
    dshash_table_item *item;

    item_pointer = dsa_allocate(hash_table->area,
                                hash_table->params.entry_size +
                                MAXALIGN(sizeof(dshash_table_item)));
    item = dsa_get_address(hash_table->area, item_pointer);
    memcpy(ENTRY_FROM_ITEM(item), key, hash_table->params.key_size);
    insert_item_into_bucket(hash_table, item_pointer, item, bucket);
    return item;
}

/*
 * Search a bucket for a matching key and delete it.
 */
static bool
delete_key_from_bucket(dshash_table *hash_table,
                       const void *key,
                       dsa_pointer *bucket_head)
{
    while (DsaPointerIsValid(*bucket_head))
    {
        dshash_table_item *item;

        item = dsa_get_address(hash_table->area, *bucket_head);

        if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
        {
            dsa_pointer next;

            next = item->next;
            dsa_free(hash_table->area, *bucket_head);
            *bucket_head = next;

            return true;
        }
        bucket_head = &item->next;
    }
    return false;
}

/*
 * Delete the specified item from the bucket.
 */
static bool
delete_item_from_bucket(dshash_table *hash_table,
                        dshash_table_item *item,
                        dsa_pointer *bucket_head)
{
    while (DsaPointerIsValid(*bucket_head))
    {
        dshash_table_item *bucket_item;

        bucket_item = dsa_get_address(hash_table->area, *bucket_head);

        if (bucket_item == item)
        {
            dsa_pointer next;

            next = item->next;
            dsa_free(hash_table->area, *bucket_head);
            *bucket_head = next;
            return true;
        }
        bucket_head = &bucket_item->next;
    }
    return false;
}

/*
 * Compute the hash value for a key.
 */
static inline dshash_hash
hash_key(dshash_table *hash_table, const void *key)
{
    return hash_table->params.hash_function(key,
                                            hash_table->params.key_size,
                                            hash_table->arg);
}

/*
 * Check whether two keys compare equal.
 */
static inline bool
equal_keys(dshash_table *hash_table, const void *a, const void *b)
{
    return hash_table->params.compare_function(a, b,
                                               hash_table->params.key_size,
                                               hash_table->arg) == 0;
}