/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *	  dynamic chained hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory.  For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking.  The simplest convention is
 * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock.  For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table.  To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create.  This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition.  We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure.  For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure.  The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp().  This is selected by specifying the
 * HASH_STRINGS flag to hash_create.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying the HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions.  At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
 *
 * Compared to simplehash, dynahash has the following benefits:
 *
 * - It supports partitioning, which is useful for shared memory access using
 *   locks.
 * - Shared memory hashes are allocated in a fixed size area at startup and
 *   are discoverable by name from other processes.
 * - Because entries don't need to be moved in the case of hash conflicts,
 *   dynahash has better performance for large entries.
 * - Guarantees stable pointers to entries.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */
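
/*
 * Illustrative sketch of the partitioning convention described above (the
 * 16 is an assumed, caller-chosen power of 2, not anything defined in this
 * file): a caller of a partitioned table might compute
 *
 *		uint32		hashcode = get_hash_value(htab, &key);
 *		int			partition = hashcode % 16;
 *
 * and take the LWLock guarding that partition before searching.  Because
 * calc_bucket() also uses the low-order bits of the hash value, all entries
 * in one bucket chain share the same partition number.
 */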

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *		added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *		changed ctl structure for shared memory
 */

#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "common/hashfn.h"
#include "port/pg_bitutils.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"


/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.  The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS			32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t		mutex;			/* spinlock for this freelist */
	long		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one.  Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 */
	FreeListData freeList[NUM_FREELISTS];

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	long		dsize;			/* directory size */
	long		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	long		num_partitions; /* # partitions (must be power of 2), or 0 */
	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	long		accesses;
	long		collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used */
	char	   *tabname;		/* table name (for error messages) */
	bool		isshared;		/* true if table is in shared memory */
	bool		isfixed;		/* if true, don't enlarge */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	Size		keysize;		/* hash key length in bytes */
	long		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *) (helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key)  \
	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)			   ((x) & ((y)-1))
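
/*
 * For example, MOD(37, 16) evaluates to 37 & 15 == 5, the same result as
 * 37 % 16; the bitwise form is valid only because 16 is a power of 2.
 */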

#ifdef HASH_STATISTICS
static long hash_accesses,
			hash_collisions,
			hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int	choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static long next_pow2_long(long num);
static int	next_pow2_int(long num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
	Assert(MemoryContextIsValid(CurrentDynaHashCxt));
	return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
									  MCXT_ALLOC_NO_OOM);
}


/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
	return strncmp(key1, key2, keysize - 1);
}


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 *	tabname: a name for the table (for debugging purposes)
 *	nelem: maximum number of elements expected
 *	*info: additional table parameters, as indicated by flags
 *	flags: bitmask indicating which parameters to take from *info
 *
 * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
 * optional, but the default keysize and entrysize values were useless.)
 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
 * binary blobs, or custom, respectively).  Callers specifying a custom
 * hash function will likely also want to use HASH_COMPARE, and perhaps
 * also HASH_KEYCOPY, to control key comparison and copying.
 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
 * under info->hcxt rather than under TopMemoryContext; the default
 * behavior is only suitable for session-lifespan hash tables.
 * Other flags bits are special-purpose and seldom used, except for those
 * associated with shared-memory hash tables, for which see ShmemInitHash().
 *
 * Fields in *info are read only when the associated flags bit is set.
 * It is not necessary to initialize other fields of *info.
 * Neither tabname nor *info need persist after the hash_create() call.
 *
 * Note: It is deprecated for callers of hash_create() to explicitly specify
 * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
 * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
 * one of these.
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;

	/*
	 * Hash tables now allocate space for key and data, but you have to say
	 * how much space to allocate.
	 */
	Assert(flags & HASH_ELEM);
	Assert(info->keysize > 0);
	Assert(info->entrysize >= info->keysize);

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple.  The memory context is made a child of
	 * either a context specified by the caller, or TopMemoryContext if
	 * nothing is specified.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		CurrentDynaHashCxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			CurrentDynaHashCxt = info->hcxt;
		else
			CurrentDynaHashCxt = TopMemoryContext;
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   "dynahash",
												   ALLOCSET_DEFAULT_SIZES);
	}

	/* Initialize the hash header, plus a copy of the table name */
	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */
	if (!(flags & HASH_SHARED_MEM))
		MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */
	if (flags & HASH_FUNCTION)
	{
		Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
		hashp->hash = info->hash;
	}
	else if (flags & HASH_BLOBS)
	{
		Assert(!(flags & HASH_STRINGS));
		/* We can optimize hashing for common key sizes */
		if (info->keysize == sizeof(uint32))
			hashp->hash = uint32_hash;
		else
			hashp->hash = tag_hash;
	}
	else
	{
		/*
		 * string_hash used to be considered the default hash method, and in
		 * a non-assert build it effectively still is.  But we now consider
		 * it an assertion error to not say HASH_STRINGS explicitly.  To help
		 * catch mistaken usage of HASH_STRINGS, we also insist on a
		 * reasonably long string length: if the keysize is only 4 or 8
		 * bytes, it's almost certainly an integer or pointer not a string.
		 */
		Assert(flags & HASH_STRINGS);
		Assert(info->keysize > 8);

		hashp->hash = string_hash;
	}

	/*
	 * If you don't specify a match function, it defaults to string_compare
	 * if you used string_hash, and to memcmp otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due
	 * to referencing a trampoline instead of the string_hash function
	 * proper.  Specify HASH_STRINGS instead.
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
	{
		/*
		 * The signature of keycopy is meant for memcpy(), which returns
		 * void*, but strlcpy() returns size_t.  Since we never use the
		 * return value of keycopy, and size_t is pretty much always the same
		 * size as void *, this should be safe.  The extra cast in the middle
		 * is to avoid warnings from -Wcast-function-type.
		 */
		hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
	}
	else
		hashp->keycopy = memcpy;

	/* And select the entry allocation function, too. */
	if (flags & HASH_ALLOC)
		hashp->alloc = info->alloc;
	else
		hashp->alloc = DynaHashAlloc;

	if (flags & HASH_SHARED_MEM)
	{
		/*
		 * ctl structure and directory are preallocated for shared memory
		 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set
		 * as well.
		 */
		hashp->hctl = info->hctl;
		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* make local copies of some heavily-used values */
			hctl = hashp->hctl;
			hashp->keysize = hctl->keysize;
			hashp->ssize = hctl->ssize;
			hashp->sshift = hctl->sshift;

			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = CurrentDynaHashCxt;
		hashp->isshared = false;
	}

	if (!hashp->hctl)
	{
		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
		if (!hashp->hctl)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	hashp->frozen = false;

	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);

		/*
		 * The number of partitions had better be a power of 2.  Also, it
		 * must be less than INT_MAX (see init_htab()), so call the int
		 * version of next_pow2.
		 */
		Assert(info->num_partitions == next_pow2_int(info->num_partitions));

		hctl->num_partitions = info->num_partitions;
	}

	if (flags & HASH_SEGMENT)
	{
		hctl->ssize = info->ssize;
		hctl->sshift = my_log2(info->ssize);
		/* ssize had better be a power of 2 */
		Assert(hctl->ssize == (1L << hctl->sshift));
	}

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */
	if (flags & HASH_DIRSIZE)
	{
		hctl->max_dsize = info->max_dsize;
		hctl->dsize = info->dsize;
	}

	/* remember the entry sizes, too */
	hctl->keysize = info->keysize;
	hctl->entrysize = info->entrysize;

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;
	hashp->ssize = hctl->ssize;
	hashp->sshift = hctl->sshift;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids
	 * wasting space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		int			i,
					freelist_partitions,
					nelem_alloc,
					nelem_alloc_first;

		/*
		 * If hash table is partitioned, give each freelist an equal share of
		 * the initial allocation.  Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}
	}

	if (flags & HASH_FIXED_SIZE)
		hashp->isfixed = true;
	return hashp;
}
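
/*
 * Minimal usage sketch (MyEntry and htab are hypothetical names, not part
 * of this file): a backend-local table mapping an Oid key to a counter.
 * The key must be the first field of the entry struct.
 *
 *		typedef struct MyEntry
 *		{
 *			Oid			key;
 *			int			count;
 *		} MyEntry;
 *
 *		HASHCTL		info;
 *		HTAB	   *htab;
 *
 *		info.keysize = sizeof(Oid);
 *		info.entrysize = sizeof(MyEntry);
 *		htab = hash_create("My lookup table", 128, &info,
 *						   HASH_ELEM | HASH_BLOBS);
 *
 * With HASH_BLOBS and a 4-byte keysize, hash_create selects uint32_hash,
 * and comparison/copying default to memcmp/memcpy per the code above.
 */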

/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;

	MemSet(hctl, 0, sizeof(HASHHDR));

	hctl->dsize = DEF_DIRSIZE;
	hctl->nsegs = 0;

	hctl->num_partitions = 0;	/* not partitioned */

	/* table has no fixed maximum size */
	hctl->max_dsize = NO_MAX_DSIZE;

	hctl->ssize = DEF_SEGSIZE;
	hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
	hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
	int			nelem_alloc;
	Size		elementSize;
	Size		allocSize;

	/* Each element has a HASHELEMENT header plus user data. */
	/* NB: this had better match element_alloc() */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

	/*
	 * The idea here is to choose nelem_alloc at least 32, but round up so
	 * that the allocation request will be a power of 2 or just less.  This
	 * makes little difference for hash tables in shared memory, but for hash
	 * tables managed by palloc, the allocation request will be rounded up to
	 * a power of 2 anyway.  If we fail to take this into account, we'll
	 * waste as much as half the allocated space.
	 */
	allocSize = 32 * 4;			/* assume elementSize at least 8 */
	do
	{
		allocSize <<= 1;
		nelem_alloc = allocSize / elementSize;
	} while (nelem_alloc < 32);

	return nelem_alloc;
}
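
/*
 * Worked example, assuming a 64-bit build where
 * MAXALIGN(sizeof(HASHELEMENT)) is 16: for entrysize = 64, elementSize is
 * 16 + 64 = 80 bytes.  allocSize doubles from 128 until
 * allocSize / elementSize reaches 32, stopping at 4096, so nelem_alloc
 * becomes 51; each batch allocation is then 51 * 80 = 4080 bytes, just
 * under the 4096-byte power of 2.
 */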

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);

	/*
	 * Make sure directory is big enough.  If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)
	{
		if (!(hashp->dir))
			hctl->dsize = nsegs;
		else
			return false;
	}

	/* Allocate a directory */
	if (!(hashp->dir))
	{
		CurrentDynaHashCxt = hashp->hcxt;
		hashp->dir = (HASHSEGMENT *)
			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
		if (!hashp->dir)
			return false;
	}

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#ifdef HASH_DEBUG
	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
			"TABLE POINTER   ", hashp,
			"DIRECTORY SIZE  ", hctl->dsize,
			"SEGMENT SIZE    ", hctl->ssize,
			"SEGMENT SHIFT   ", hctl->sshift,
			"MAX BUCKET      ", hctl->max_bucket,
			"HIGH MASK       ", hctl->high_mask,
			"LOW  MASK       ", hctl->low_mask,
			"NSEGS           ", hctl->nsegs);
#endif
	return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
	Size		size;
	long		nBuckets,
				nSegments,
				nDirEntries,
				nElementAllocs,
				elementSize,
				elementAllocCnt;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	/* fixed control info */
	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
	/* directory */
	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
	/* segments */
	size = add_size(size, mul_size(nSegments,
								   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	elementAllocCnt = choose_nelem_alloc(entrysize);
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

	return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
	long		nBuckets,
				nSegments,
				nDirEntries;

	/* estimate number of buckets wanted */
	nBuckets = next_pow2_long(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = DEF_DIRSIZE;
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters.  We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
	Assert(flags & HASH_DIRSIZE);
	Assert(info->dsize == info->max_dsize);
	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}
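
/*
 * Hedged sketch of how the sizing routines above fit together for a shared
 * table (simplified relative to what ShmemInitHash() actually does; MyKey
 * and MyEntry are stand-in names):
 *
 *		HASHCTL		info;
 *
 *		info.keysize = sizeof(MyKey);
 *		info.entrysize = sizeof(MyEntry);
 *		info.dsize = info.max_dsize = hash_select_dirsize(max_entries);
 *
 * hash_estimate_size(max_entries, sizeof(MyEntry)) feeds the total
 * shared-memory size request at startup, while hash_get_shared_size(&info,
 * flags) gives the size of the initial allocation that holds the HASHHDR
 * plus the fixed-size directory.
 */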


/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
	if (hashp != NULL)
	{
		/* allocation method must be one we know how to free, too */
		Assert(hashp->alloc == DynaHashAlloc);
		/* so this hashtable must have its own context */
		Assert(hashp->hcxt != NULL);

		hash_stats("destroy", hashp);

		/*
		 * Free everything by destroying the hash table's memory context.
		 */
		MemoryContextDelete(hashp->hcxt);
	}
}

void
hash_stats(const char *where, HTAB *hashp)
{
#ifdef HASH_STATISTICS
	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
			where, hashp->hctl->accesses, hashp->hctl->collisions);

	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
			hashp->hctl->max_bucket, hashp->hctl->nsegs);
	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
			where, hash_accesses, hash_collisions);
	fprintf(stderr, "hash_stats: total expansions %ld\n",
			hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
	return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
	uint32		bucket;

	bucket = hash_val & hctl->high_mask;
	if (bucket > hctl->max_bucket)
		bucket = bucket & hctl->low_mask;

	return bucket;
}
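
/*
 * Worked example of the mapping above: suppose buckets 0..10 exist
 * (max_bucket = 10), so low_mask = 7 and high_mask = 15.  A hash value of
 * 13 gives 13 & 15 = 13, which exceeds max_bucket, so it is re-masked to
 * 13 & 7 = 5; a hash value of 9 gives 9 & 15 = 9, a valid bucket, used
 * as-is.  This is the standard linear-hashing scheme, which lets
 * expand_table() split one bucket at a time without rehashing the rest of
 * the table.
 */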

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *		HASH_FIND: look up key in table
 *		HASH_ENTER: look up key in table, creating entry if not present
 *		HASH_ENTER_NULL: same, but return NULL if out of memory
 *		HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise.  This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
void *
hash_search(HTAB *hashp,
			const void *keyPtr,
			HASHACTION action,
			bool *foundPtr)
{
	return hash_search_with_hash_value(hashp,
									   keyPtr,
									   hashp->hash(keyPtr, hashp->keysize),
									   action,
									   foundPtr);
}

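/*
 * Typical HASH_ENTER usage, as a sketch (MyEntry, htab, and key are the
 * hypothetical names from the hash_create example above):
 *
 *		bool		found;
 *		MyEntry    *entry;
 *
 *		entry = (MyEntry *) hash_search(htab, &key, HASH_ENTER, &found);
 *		if (!found)
 *			entry->count = 0;	(caller initializes non-key fields)
 *		entry->count++;
 *
 * dynahash copies only the key into a newly entered element; the caller
 * must fill in everything else, as the comments below emphasize.
 */
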
void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	uint32		bucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 */
		if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
			!IS_PARTITIONED(hctl) && !hashp->frozen &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
	bucket = calc_bucket(hctl, hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the
				 * next time something is added to the table
				 */
				return (void *) ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER:
		case HASH_ENTER_NULL:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return (void *) ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt
			 * the caller's data structure.
			 */

			return (void *) ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}

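/*
 * For a partitioned shared table, a caller would typically split the lookup
 * as below (a sketch; GetPartitionLock() stands in for whatever maps a hash
 * code to the right LWLock in the caller's own code):
 *
 *		uint32		hashcode = get_hash_value(htab, &key);
 *		LWLock	   *lock = GetPartitionLock(hashcode);
 *
 *		LWLockAcquire(lock, LW_SHARED);
 *		entry = hash_search_with_hash_value(htab, &key, hashcode,
 *											HASH_FIND, NULL);
 *		...
 *		LWLockRelease(lock);
 *
 * Computing the hash once both selects the partition and avoids hashing the
 * key a second time inside the search.
 */
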
/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present.  Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation.  This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	HASHHDR    *hctl = hashp->hctl;
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hash_accesses++;
	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side
	 * benefit we can verify the validity of the passed existingEntry
	 * pointer.
	 */
	bucket = calc_bucket(hctl, existingElement->hashvalue);

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the
	 * hash chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);

	newbucket = calc_bucket(hctl, newhashvalue);

	segment_num = newbucket >> hashp->sshift;
	segment_ndx = MOD(newbucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	if (segp == NULL)
		hash_corrupted(hashp);

	prevBucketPtr = &segp[segment_ndx];
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hash_collisions++;
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}

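/*
 * Hedged usage sketch: rekeying an entry in place instead of doing a
 * HASH_REMOVE/HASH_ENTER pair (newkey is a hypothetical variable of the
 * table's key type):
 *
 *		if (!hash_update_hash_key(htab, entry, &newkey))
 *			elog(ERROR, "key already present");
 *
 * Because the entry never visits the freelist, no concurrent process can
 * claim it mid-operation, which is what makes the no-OOM guarantee above
 * possible.
 */
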
1286 : /*
2087 tgl 1287 ECB : * Allocate a new hashtable entry if possible; return NULL if out of memory.
1288 : * (Or, if the underlying space allocator throws error for out-of-memory,
1289 : * we won't return at all.)
1290 : */
1291 : static HASHBUCKET
2573 rhaas 1292 GIC 39092829 : get_hash_entry(HTAB *hashp, int freelist_idx)
1293 : {
1294 39092829 : HASHHDR *hctl = hashp->hctl;
6105 tgl 1295 ECB : HASHBUCKET newElement;
1296 :
1297 : for (;;)
1298 : {
1299 : /* if partitioned, must lock to touch nentries and freeList */
2732 rhaas 1300 GIC 39314639 : if (IS_PARTITIONED(hctl))
2573 rhaas 1301 CBC 7267667 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
6105 tgl 1302 ECB :
1303 : /* try to get an entry from the freelist */
2573 rhaas 1304 CBC 39314639 : newElement = hctl->freeList[freelist_idx].freeList;
2573 rhaas 1305 ECB :
6105 tgl 1306 GIC 39314639 : if (newElement != NULL)
1307 39092829 : break;
1308 :
2732 rhaas 1309 221810 : if (IS_PARTITIONED(hctl))
2573 1310 1196 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1311 :
1312 : /*
1313 : * No free elements in this freelist. In a partitioned table, there
1314 : * might be entries in other freelists, but to reduce contention we
1315 : * prefer to first try to get another chunk of buckets from the main
1316 : * shmem allocator. If that fails, though, we *MUST* root through all
1317 : * the other freelists before giving up. There are multiple callers
1318 : * that assume that they can allocate every element in the initially
2087 tgl 1319 ECB : * requested table size, or that deleting an element guarantees they
1320 : * can insert a new element, even if shared memory is entirely full.
1321 : * Failing because the needed element is in a different freelist is
1322 : * not acceptable.
2087 tgl 1323 EUB : */
2573 rhaas 1324 GBC 221810 : if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1325 : {
1326 : int borrow_from_idx;
2087 tgl 1327 EUB :
2573 rhaas 1328 UIC 0 : if (!IS_PARTITIONED(hctl))
1329 0 : return NULL; /* out of memory */
2573 rhaas 1330 EUB :
2087 tgl 1331 : /* try to borrow element from another freelist */
2573 rhaas 1332 UBC 0 : borrow_from_idx = freelist_idx;
1333 : for (;;)
2573 rhaas 1334 EUB : {
2573 rhaas 1335 UBC 0 : borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
2573 rhaas 1336 UIC 0 : if (borrow_from_idx == freelist_idx)
2087 tgl 1337 UBC 0 : break; /* examined all freelists, fail */
1338 :
2573 rhaas 1339 0 : SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1340 0 : newElement = hctl->freeList[borrow_from_idx].freeList;
1341 :
2573 rhaas 1342 UIC 0 : if (newElement != NULL)
2573 rhaas 1343 EUB : {
2573 rhaas 1344 UBC 0 : hctl->freeList[borrow_from_idx].freeList = newElement->link;
1345 0 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1346 :
2087 tgl 1347 EUB : /* careful: count the new element in its proper freelist */
2573 rhaas 1348 UIC 0 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1349 0 : hctl->freeList[freelist_idx].nentries++;
2573 rhaas 1350 UBC 0 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1351 :
2087 tgl 1352 UIC 0 : return newElement;
1353 : }
2573 rhaas 1354 EUB :
2573 rhaas 1355 UIC 0 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1356 : }
1357 :
1358 : /* no elements available to borrow either, so out of memory */
2087 tgl 1359 LBC 0 : return NULL;
6105 tgl 1360 ECB : }
1361 : }
1362 :
1363 : /* remove entry from freelist, bump nentries */
2573 rhaas 1364 GIC 39092829 : hctl->freeList[freelist_idx].freeList = newElement->link;
2573 rhaas 1365 CBC 39092829 : hctl->freeList[freelist_idx].nentries++;
1366 :
2732 rhaas 1367 GIC 39092829 : if (IS_PARTITIONED(hctl))
2573 1368 7266471 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1369 :
6105 tgl 1370 39092829 : return newElement;
1371 : }
6105 tgl 1372 ECB :
1373 : /*
1374 : * hash_get_num_entries -- get the number of entries in a hashtable
1375 : */
1376 : long
6105 tgl 1377 GIC 34950 : hash_get_num_entries(HTAB *hashp)
1378 : {
1379 : int i;
2573 rhaas 1380 34950 : long sum = hashp->hctl->freeList[0].nentries;
1381 :
6105 tgl 1382 ECB : /*
1383 : * We currently don't bother with acquiring the mutexes; it's only
2087 1384 : * sensible to call this function if you hold a lock on all partitions of
1385 : * the table.
1386 : */
2087 tgl 1387 GIC 34950 : if (IS_PARTITIONED(hashp->hctl))
2087 tgl 1388 ECB : {
2087 tgl 1389 GIC 72864 : for (i = 1; i < NUM_FREELISTS; i++)
1390 70587 : sum += hashp->hctl->freeList[i].nentries;
1391 : }
1392 :
2573 rhaas 1393 34950 : return sum;
1394 : }
1395 :
1396 : /*
1397 : * hash_seq_init/_search/_term
1398 : * Sequentially search through the hash table and return
1399 : * all the elements one by one; return NULL when no more remain.
1400 : *
1401 : * hash_seq_term should be called if and only if the scan is abandoned before
1402 : * completion; if hash_seq_search returns NULL then it has already done the
1403 : * end-of-scan cleanup.
1404 : *
1405 : * NOTE: caller may delete the returned element before continuing the scan.
1406 : * However, deleting any other element while the scan is in progress is
1407 : * UNDEFINED (it might be the one that curEntry is pointing at!). Also,
1408 : * if elements are added to the table while the scan is in progress, it is
1409 : * unspecified whether they will be visited by the scan or not.
1410 : *
1411 : * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1412 : * worry about hash_seq_term cleanup, if the hashtable is first locked against
1413 : * further insertions by calling hash_freeze.
1414 : *
1415 : * NOTE: to use this with a partitioned hashtable, caller had better hold
6105 tgl 1416 ECB : * at least shared lock on all partitions of the table throughout the scan!
1417 : * We can cope with insertions or deletions by our own backend, but *not*
5827 1418 : * with concurrent insertions or deletions by another.
9770 scrappy 1419 : */
8132 tgl 1420 : void
8132 tgl 1421 CBC 3509197 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
8132 tgl 1422 ECB : {
8132 tgl 1423 CBC 3509197 : status->hashp = hashp;
8132 tgl 1424 GIC 3509197 : status->curBucket = 0;
7860 1425 3509197 : status->curEntry = NULL;
5827 tgl 1426 CBC 3509197 : if (!hashp->frozen)
5827 tgl 1427 GIC 3509197 : register_seq_scan(hashp);
8132 1428 3509197 : }
1429 :
1430 : void *
1431 40463454 : hash_seq_search(HASH_SEQ_STATUS *status)
1432 : {
1433 : HTAB *hashp;
1434 : HASHHDR *hctl;
1435 : uint32 max_bucket;
1436 : long ssize;
1437 : long segment_num;
6547 tgl 1438 ECB : long segment_ndx;
1439 : HASHSEGMENT segp;
1440 : uint32 curBucket;
1441 : HASHELEMENT *curElem;
9345 bruce 1442 :
6547 tgl 1443 CBC 40463454 : if ((curElem = status->curEntry) != NULL)
9345 bruce 1444 ECB : {
1445 : /* Continuing scan of curBucket... */
6547 tgl 1446 GIC 8366635 : status->curEntry = curElem->link;
6385 bruce 1447 8366635 : if (status->curEntry == NULL) /* end of this bucket */
6547 tgl 1448 6431228 : ++status->curBucket;
1449 8366635 : return (void *) ELEMENTKEY(curElem);
6547 tgl 1450 ECB : }
9345 bruce 1451 :
6547 tgl 1452 : /*
1453 : * Search for next nonempty bucket starting at curBucket.
1454 : */
6547 tgl 1455 GIC 32096819 : curBucket = status->curBucket;
6547 tgl 1456 CBC 32096819 : hashp = status->hashp;
6547 tgl 1457 GIC 32096819 : hctl = hashp->hctl;
6105 tgl 1458 CBC 32096819 : ssize = hashp->ssize;
6547 1459 32096819 : max_bucket = hctl->max_bucket;
1460 :
6547 tgl 1461 GIC 32096819 : if (curBucket > max_bucket)
1462 : {
5827 1463 102550 : hash_seq_term(status);
6385 bruce 1464 102550 : return NULL; /* search is done */
5827 tgl 1465 ECB : }
9345 bruce 1466 :
1467 : /*
6547 tgl 1468 : * first find the right segment in the table directory.
1469 : */
6105 tgl 1470 GIC 31994269 : segment_num = curBucket >> hashp->sshift;
6547 1471 31994269 : segment_ndx = MOD(curBucket, ssize);
1472 :
1473 31994269 : segp = hashp->dir[segment_num];
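	/*
	 * Worked example (assuming the default segment size of 256, i.e.
	 * sshift = 8): bucket 1000 lives at dir[1000 >> 8] = dir[3], at
	 * index 1000 % 256 = 232 within that segment.
	 */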
1474 :
1475 : /*
6537 tgl 1476 ECB : * Pick up the first item in this bucket's chain. If chain is not empty
1477 : * we can begin searching it. Otherwise we have to advance to find the
1478 : * next nonempty bucket. We try to optimize that case since searching a
1479 : * near-empty hashtable has to iterate this loop a lot.
1480 : */
6547 tgl 1481 CBC 282858170 : while ((curElem = segp[segment_ndx]) == NULL)
6547 tgl 1482 ECB : {
1483 : /* empty bucket, advance to next */
6547 tgl 1484 GIC 254254810 : if (++curBucket > max_bucket)
6547 tgl 1485 ECB : {
6547 tgl 1486 GIC 3390909 : status->curBucket = curBucket;
5827 tgl 1487 CBC 3390909 : hash_seq_term(status);
6385 bruce 1488 3390909 : return NULL; /* search is done */
6547 tgl 1489 ECB : }
6547 tgl 1490 GIC 250863901 : if (++segment_ndx >= ssize)
1491 : {
1492 131483 : segment_num++;
1493 131483 : segment_ndx = 0;
6547 tgl 1494 CBC 131483 : segp = hashp->dir[segment_num];
6547 tgl 1495 ECB : }
9770 scrappy 1496 : }
9345 bruce 1497 :
6547 tgl 1498 : /* Begin scan of curBucket... */
6547 tgl 1499 GIC 28603360 : status->curEntry = curElem->link;
2118 1500 28603360 : if (status->curEntry == NULL) /* end of this bucket */
6547 1501 22172100 : ++curBucket;
6547 tgl 1502 CBC 28603360 : status->curBucket = curBucket;
6547 tgl 1503 GIC 28603360 : return (void *) ELEMENTKEY(curElem);
9770 scrappy 1504 ECB : }
1505 :
5827 tgl 1506 : void
5827 tgl 1507 GIC 3509187 : hash_seq_term(HASH_SEQ_STATUS *status)
1508 : {
1509 3509187 : if (!status->hashp->frozen)
1510 3509187 : deregister_seq_scan(status->hashp);
1511 3509187 : }
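/*
 * Illustrative usage sketch (MyEntry, mytab, and done_early are
 * hypothetical names, not part of this API): the typical scan loop looks
 * like the following.  hash_seq_term is required only on the early-exit
 * path; when the loop runs until hash_seq_search returns NULL, the
 * end-of-scan cleanup has already been done.
 *
 *		HASH_SEQ_STATUS status;
 *		MyEntry    *entry;
 *
 *		hash_seq_init(&status, mytab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *		{
 *			if (done_early(entry))
 *			{
 *				hash_seq_term(&status);
 *				break;
 *			}
 *		}
 */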
1512 :
1513 : /*
1514 : * hash_freeze
1515 : * Freeze a hashtable against future insertions (deletions are
1516 : * still allowed)
1517 : *
1518 : * The reason for doing this is that by preventing any more bucket splits,
1519 : * we no longer need to worry about registering hash_seq_search scans,
1520 : * and thus the caller need not be careful about ensuring hash_seq_term gets
1521 : * called at the right times.
5827 tgl 1522 EUB : *
1523 : * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1524 : * with active scans (since hash_seq_term would then do the wrong thing).
1525 : */
1526 : void
5827 tgl 1527 UBC 0 : hash_freeze(HTAB *hashp)
1528 : {
1529 0 : if (hashp->isshared)
5689 1530 0 : elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
5827 tgl 1531 UIC 0 : if (!hashp->frozen && has_seq_scans(hashp))
5689 1532 0 : elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1533 : hashp->tabname);
5827 1534 0 : hashp->frozen = true;
1535 0 : }
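/*
 * A minimal sketch of the pattern hash_freeze enables (mytab, MyEntry, and
 * found_target are hypothetical): because hash_seq_init does not register
 * scans on a frozen table, the loop below may simply break out early
 * without calling hash_seq_term.
 *
 *		hash_freeze(mytab);
 *		hash_seq_init(&status, mytab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *		{
 *			if (found_target(entry))
 *				break;
 *		}
 */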
1536 :
1537 :
1538 : /********************************* UTILITIES ************************/
8714 tgl 1539 ECB :
1540 : /*
1541 : * Expand the table by adding one more hash bucket.
1542 : */
1543 : static bool
9344 bruce 1544 GIC 407775 : expand_table(HTAB *hashp)
1545 : {
7836 1546 407775 : HASHHDR *hctl = hashp->hctl;
1547 : HASHSEGMENT old_seg,
1548 : new_seg;
1549 : long old_bucket,
1550 : new_bucket;
1551 : long new_segnum,
1552 : new_segndx;
1553 : long old_segnum,
1554 : old_segndx;
7860 tgl 1555 ECB : HASHBUCKET *oldlink,
1556 : *newlink;
1557 : HASHBUCKET currElement,
1558 : nextElement;
1559 :
6105 tgl 1560 GIC 407775 : Assert(!IS_PARTITIONED(hctl));
6105 tgl 1561 ECB :
9770 scrappy 1562 : #ifdef HASH_STATISTICS
9345 bruce 1563 : hash_expansions++;
1564 : #endif
1565 :
8812 tgl 1566 GIC 407775 : new_bucket = hctl->max_bucket + 1;
6105 1567 407775 : new_segnum = new_bucket >> hashp->sshift;
6105 tgl 1568 CBC 407775 : new_segndx = MOD(new_bucket, hashp->ssize);
9345 bruce 1569 EUB :
9345 bruce 1570 GBC 407775 : if (new_segnum >= hctl->nsegs)
9345 bruce 1571 ECB : {
8812 tgl 1572 EUB : /* Allocate new segment if necessary -- could fail if dir full */
9345 bruce 1573 CBC 1447 : if (new_segnum >= hctl->dsize)
8720 bruce 1574 UIC 0 : if (!dir_realloc(hashp))
7860 tgl 1575 0 : return false;
9345 bruce 1576 GIC 1447 : if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
7860 tgl 1577 LBC 0 : return false;
9345 bruce 1578 GIC 1447 : hctl->nsegs++;
1579 : }
1580 :
1581 : /* OK, we created a new bucket */
8812 tgl 1582 407775 : hctl->max_bucket++;
1583 :
1584 : /*
8714 tgl 1585 ECB : * *Before* changing masks, find old bucket corresponding to same hash
1586 : * values; values in that bucket may need to be relocated to new bucket.
1587 : * Note that new_bucket is certainly larger than low_mask at this point,
1588 : * so we can skip the first step of the regular hash mask calc.
1589 : */
8714 tgl 1590 CBC 407775 : old_bucket = (new_bucket & hctl->low_mask);
1591 :
8714 tgl 1592 ECB : /*
1593 : * If we crossed a power of 2, readjust masks.
1594 : */
7701 tgl 1595 GIC 407775 : if ((uint32) new_bucket > hctl->high_mask)
1596 : {
9345 bruce 1597 2635 : hctl->low_mask = hctl->high_mask;
7701 tgl 1598 2635 : hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1599 : }
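	/*
	 * Worked example (assuming the table started with 8 buckets): while it
	 * grows from 8 to 16 buckets, low_mask = 7 and high_mask = 15, so
	 * creating bucket 16 crosses a power of 2; at that point low_mask
	 * becomes 15 and high_mask becomes 16 | 15 = 31.
	 */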
1600 :
1601 : /*
3260 bruce 1602 ECB : * Relocate records to the new bucket. NOTE: because of the way the hash
6385 1603 : * masking is done in calc_bucket, only one old bucket can need to be
1604 : * split at this point. With a different way of reducing the hash value,
1605 : * that might not be true!
9345 1606 : */
6105 tgl 1607 GIC 407775 : old_segnum = old_bucket >> hashp->sshift;
6105 tgl 1608 CBC 407775 : old_segndx = MOD(old_bucket, hashp->ssize);
9345 bruce 1609 ECB :
7860 tgl 1610 GIC 407775 : old_seg = hashp->dir[old_segnum];
7860 tgl 1611 CBC 407775 : new_seg = hashp->dir[new_segnum];
9345 bruce 1612 ECB :
7860 tgl 1613 CBC 407775 : oldlink = &old_seg[old_segndx];
7860 tgl 1614 GIC 407775 : newlink = &new_seg[new_segndx];
7860 tgl 1615 ECB :
7860 tgl 1616 CBC 407775 : for (currElement = *oldlink;
7860 tgl 1617 GIC 973286 : currElement != NULL;
7860 tgl 1618 CBC 565511 : currElement = nextElement)
9345 bruce 1619 ECB : {
7860 tgl 1620 GIC 565511 : nextElement = currElement->link;
7173 1621 565511 : if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1622 : {
7860 tgl 1623 CBC 281169 : *oldlink = currElement;
1624 281169 : oldlink = &currElement->link;
1625 : }
1626 : else
1627 : {
1628 284342 : *newlink = currElement;
1629 284342 : newlink = &currElement->link;
1630 : }
9770 scrappy 1631 ECB : }
1632 : /* don't forget to terminate the rebuilt hash chains... */
7860 tgl 1633 GIC 407775 : *oldlink = NULL;
1634 407775 : *newlink = NULL;
1635 :
7860 tgl 1636 GBC 407775 : return true;
1637 : }
1638 :
1639 :
1640 : static bool
9344 bruce 1641 UIC 0 : dir_realloc(HTAB *hashp)
1642 : {
1643 : HASHSEGMENT *p;
7860 tgl 1644 EUB : HASHSEGMENT *old_p;
8812 1645 : long new_dsize;
1646 : long old_dirsize;
1647 : long new_dirsize;
9345 bruce 1648 :
9345 bruce 1649 UBC 0 : if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
7860 tgl 1650 0 : return false;
1651 :
9345 bruce 1652 EUB : /* Reallocate directory */
8812 tgl 1653 UBC 0 : new_dsize = hashp->hctl->dsize << 1;
7860 1654 0 : old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
7860 tgl 1655 UIC 0 : new_dirsize = new_dsize * sizeof(HASHSEGMENT);
9345 bruce 1656 EUB :
7860 tgl 1657 UIC 0 : old_p = hashp->dir;
7961 JanWieck 1658 UBC 0 : CurrentDynaHashCxt = hashp->hcxt;
7860 tgl 1659 0 : p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
8812 tgl 1660 EUB :
9345 bruce 1661 UBC 0 : if (p != NULL)
1662 : {
7860 tgl 1663 UIC 0 : memcpy(p, old_p, old_dirsize);
7860 tgl 1664 UBC 0 : MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1665 0 : hashp->dir = p;
8812 tgl 1666 UIC 0 : hashp->hctl->dsize = new_dsize;
6547 tgl 1667 EUB :
1668 : /* XXX assume the allocator is palloc, so we know how to free */
6547 tgl 1669 UIC 0 : Assert(hashp->alloc == DynaHashAlloc);
6547 tgl 1670 UBC 0 : pfree(old_p);
1671 :
7860 tgl 1672 UIC 0 : return true;
1673 : }
1674 :
7860 tgl 1675 LBC 0 : return false;
1676 : }
1677 :
1678 :
7860 tgl 1679 ECB : static HASHSEGMENT
9344 bruce 1680 CBC 666095 : seg_alloc(HTAB *hashp)
1681 : {
7836 bruce 1682 ECB : HASHSEGMENT segp;
9345 bruce 1683 EUB :
7961 JanWieck 1684 GIC 666095 : CurrentDynaHashCxt = hashp->hcxt;
6105 tgl 1685 CBC 666095 : segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1686 :
9345 bruce 1687 666095 : if (!segp)
7860 tgl 1688 UIC 0 : return NULL;
1689 :
6105 tgl 1690 GIC 666095 : MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1691 :
7860 1692 666095 : return segp;
1693 : }
9770 scrappy 1694 ECB :
1695 : /*
2573 rhaas 1696 : * allocate some new elements and link them into the indicated free list
1697 : */
1698 : static bool
2573 rhaas 1699 GIC 580132 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
1700 : {
1701 580132 : HASHHDR *hctl = hashp->hctl;
1702 : Size elementSize;
6105 tgl 1703 ECB : HASHELEMENT *firstElement;
7860 tgl 1704 EUB : HASHELEMENT *tmpElement;
1705 : HASHELEMENT *prevElement;
1706 : int i;
9345 bruce 1707 ECB :
4381 heikki.linnakangas 1708 GIC 580132 : if (hashp->isfixed)
4381 heikki.linnakangas 1709 LBC 0 : return false;
4381 heikki.linnakangas 1710 ECB :
1711 : /* Each element has a HASHELEMENT header plus user data. */
2732 rhaas 1712 CBC 580132 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
9345 bruce 1713 EUB :
7961 JanWieck 1714 GIC 580132 : CurrentDynaHashCxt = hashp->hcxt;
6105 tgl 1715 580132 : firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
9345 bruce 1716 ECB :
6105 tgl 1717 CBC 580132 : if (!firstElement)
7860 tgl 1718 LBC 0 : return false;
1719 :
6105 tgl 1720 ECB : /* prepare to link all the new entries into the freelist */
6105 tgl 1721 CBC 580132 : prevElement = NULL;
1722 580132 : tmpElement = firstElement;
6767 tgl 1723 GIC 86955337 : for (i = 0; i < nelem; i++)
1724 : {
6105 1725 86375205 : tmpElement->link = prevElement;
6105 tgl 1726 CBC 86375205 : prevElement = tmpElement;
7860 1727 86375205 : tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1728 : }
1729 :
6105 tgl 1730 ECB : /* if partitioned, must lock to touch freeList */
2732 rhaas 1731 CBC 580132 : if (IS_PARTITIONED(hctl))
2573 rhaas 1732 GIC 293356 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
6105 tgl 1733 ECB :
1734 : /* freelist could be nonempty if two backends did this concurrently */
2573 rhaas 1735 GIC 580132 : firstElement->link = hctl->freeList[freelist_idx].freeList;
2573 rhaas 1736 CBC 580132 : hctl->freeList[freelist_idx].freeList = prevElement;
1737 :
2732 rhaas 1738 GIC 580132 : if (IS_PARTITIONED(hctl))
2573 1739 293356 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1740 :
7860 tgl 1741 GBC 580132 : return true;
1742 : }
1743 :
1744 : /* complain when we have detected a corrupted hashtable */
1745 : static void
7856 tgl 1746 UIC 0 : hash_corrupted(HTAB *hashp)
7856 tgl 1747 EUB : {
1748 : /*
1749 : * If the corruption is in a shared hashtable, we'd better force a
3260 bruce 1750 : * systemwide restart. Otherwise, just shut down this one backend.
1751 : */
7856 tgl 1752 UIC 0 : if (hashp->isshared)
7198 1753 0 : elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1754 : else
7198 tgl 1755 LBC 0 : elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1756 : }
1757 :
1758 : /* calculate ceil(log base 2) of num */
1759 : int
9770 scrappy 1760 GIC 547029 : my_log2(long num)
9770 scrappy 1761 ECB : {
1060 tgl 1762 EUB : /*
1763 : * guard against too-large input, which would be invalid for
1764 : * pg_ceil_log2_*()
1765 : */
3771 tgl 1766 GIC 547029 : if (num > LONG_MAX / 2)
3771 tgl 1767 LBC 0 : num = LONG_MAX / 2;
1768 :
1769 : #if SIZEOF_LONG < 8
1770 : return pg_ceil_log2_32(num);
1771 : #else
1096 drowley 1772 GIC 547029 : return pg_ceil_log2_64(num);
1096 drowley 1773 ECB : #endif
1774 : }
1775 :
3771 tgl 1776 : /* calculate first power of 2 >= num, bounded to what will fit in a long */
1777 : static long
3771 tgl 1778 GIC 63908 : next_pow2_long(long num)
1779 : {
1780 : /* my_log2's internal range check is sufficient */
3771 tgl 1781 CBC 63908 : return 1L << my_log2(num);
1782 : }
3771 tgl 1783 ECB :
3771 tgl 1784 EUB : /* calculate first power of 2 >= num, bounded to what will fit in an int */
3771 tgl 1785 ECB : static int
3771 tgl 1786 GIC 464314 : next_pow2_int(long num)
1787 : {
1788 464314 : if (num > INT_MAX / 2)
3771 tgl 1789 UIC 0 : num = INT_MAX / 2;
3771 tgl 1790 GIC 464314 : return 1 << my_log2(num);
1791 : }
1792 :
1793 :
1794 : /************************* SEQ SCAN TRACKING ************************/
1795 :
1796 : /*
1797 : * We track active hash_seq_search scans here. The need for this mechanism
1798 : * comes from the fact that a scan will get confused if a bucket split occurs
1799 : * while it's in progress: it might visit entries twice, or even miss some
1800 : * entirely (if it's partway through the same bucket that splits). Hence
1801 : * we want to inhibit bucket splits if there are any active scans on the
1802 : * table being inserted into. This is a fairly rare case in current usage,
1803 : * so just postponing the split until the next insertion seems sufficient.
1804 : *
1805 : * Given present usages of the function, only a few scans are likely to be
1806 : * open concurrently; so a finite-size stack of open scans seems sufficient,
1807 : * and we don't worry that linear search is too slow. Note that we do
1808 : * allow multiple scans of the same hashtable to be open concurrently.
1809 : *
1810 : * This mechanism can support concurrent scan and insertion in a shared
1811 : * hashtable if it's the same backend doing both. It would fail otherwise,
1812 : * but locking reasons seem to preclude any such scenario anyway, so we don't
1813 : * worry.
1814 : *
1815 : * This arrangement is reasonably robust if a transient hashtable is deleted
1816 : * without notifying us. The absolute worst case is we might inhibit splits
1817 : * in another table created later at exactly the same address. We will give
1818 : * a warning at transaction end for reference leaks, so any bugs leading to
1819 : * lack of notification should be easy to catch.
1820 : */
1821 :
1822 : #define MAX_SEQ_SCANS 100
1823 :
1824 : static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1825 : static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
5827 tgl 1826 ECB : static int num_seq_scans = 0;
1827 :
1828 :
5827 tgl 1829 EUB : /* Register a table as having an active hash_seq_search scan */
1830 : static void
5827 tgl 1831 CBC 3509197 : register_seq_scan(HTAB *hashp)
5827 tgl 1832 ECB : {
5827 tgl 1833 CBC 3509197 : if (num_seq_scans >= MAX_SEQ_SCANS)
5689 tgl 1834 LBC 0 : elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1835 : hashp->tabname);
5827 tgl 1836 GIC 3509197 : seq_scan_tables[num_seq_scans] = hashp;
1837 3509197 : seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
5827 tgl 1838 CBC 3509197 : num_seq_scans++;
5827 tgl 1839 GIC 3509197 : }
1840 :
1841 : /* Deregister an active scan */
1842 : static void
5827 tgl 1843 CBC 3509187 : deregister_seq_scan(HTAB *hashp)
1844 : {
5624 bruce 1845 ECB : int i;
1846 :
5827 tgl 1847 : /* Search backward since it's most likely at the stack top */
5827 tgl 1848 CBC 3509187 : for (i = num_seq_scans - 1; i >= 0; i--)
5827 tgl 1849 ECB : {
5827 tgl 1850 CBC 3509187 : if (seq_scan_tables[i] == hashp)
1851 : {
5827 tgl 1852 GIC 3509187 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
5827 tgl 1853 GBC 3509187 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
5827 tgl 1854 GIC 3509187 : num_seq_scans--;
1855 3509187 : return;
1856 : }
1857 : }
5827 tgl 1858 UIC 0 : elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
5827 tgl 1859 ECB : hashp->tabname);
1860 : }
1861 :
1862 : /* Check if a table has any active scan */
1863 : static bool
5827 tgl 1864 GIC 407775 : has_seq_scans(HTAB *hashp)
5827 tgl 1865 EUB : {
5624 bruce 1866 : int i;
1867 :
5827 tgl 1868 CBC 407775 : for (i = 0; i < num_seq_scans; i++)
1869 : {
5827 tgl 1870 UIC 0 : if (seq_scan_tables[i] == hashp)
1871 0 : return true;
1872 : }
5827 tgl 1873 CBC 407775 : return false;
1874 : }
1875 :
1876 : /* Clean up any open scans at end of transaction */
1877 : void
5827 tgl 1878 GIC 486642 : AtEOXact_HashTables(bool isCommit)
1879 : {
1880 : /*
1881 : * During abort cleanup, open scans are expected; just silently clean 'em
1882 : * out. An open scan at commit means someone forgot a hash_seq_term()
1883 : * call, so complain.
5827 tgl 1884 ECB : *
1885 : * Note: it's tempting to try to print the tabname here, but refrain for
1886 : * fear of touching deallocated memory. This isn't a user-facing message
1887 : * anyway, so it needn't be pretty.
1888 : */
5827 tgl 1889 GIC 486642 : if (isCommit)
5827 tgl 1890 EUB : {
1891 : int i;
1892 :
5827 tgl 1893 GIC 466482 : for (i = 0; i < num_seq_scans; i++)
5827 tgl 1894 ECB : {
5827 tgl 1895 LBC 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1896 : seq_scan_tables[i]);
1897 : }
1898 : }
5827 tgl 1899 CBC 486642 : num_seq_scans = 0;
5827 tgl 1900 GIC 486642 : }
1901 :
1902 : /* Clean up any open scans at end of subtransaction */
1903 : void
1904 8815 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1905 : {
1906 : int i;
1907 :
5827 tgl 1908 ECB : /*
1909 : * Search backward to make cleanup easy. Note we must check all entries,
5827 tgl 1910 EUB : * not only those at the end of the array, because the deletion technique
1911 : * doesn't keep them in order.
1912 : */
5827 tgl 1913 GBC 8815 : for (i = num_seq_scans - 1; i >= 0; i--)
1914 : {
5827 tgl 1915 UBC 0 : if (seq_scan_level[i] >= nestDepth)
5827 tgl 1916 EUB : {
5827 tgl 1917 UBC 0 : if (isCommit)
5827 tgl 1918 UIC 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1919 : seq_scan_tables[i]);
5827 tgl 1920 LBC 0 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
5827 tgl 1921 UIC 0 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1922 0 : num_seq_scans--;
1923 : }
1924 : }
5827 tgl 1925 GIC 8815 : }