Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * dynahash.c
4 : : * dynamic chained hash tables
5 : : *
6 : : * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 : : * shared memory. For shared hash tables, it is the caller's responsibility
8 : : * to provide appropriate access interlocking. The simplest convention is
9 : : * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 : : * hash_seq_search) need only shared lock, but any update requires exclusive
11 : : * lock. For heavily-used shared tables, the single-lock approach creates a
12 : : * concurrency bottleneck, so we also support "partitioned" locking wherein
13 : : * there are multiple LWLocks guarding distinct subsets of the table. To use
14 : : * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 : : * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 : : * Therefore, each hash bucket chain operates independently, and no fields
17 : : * of the hash header change after init except nentries and freeList.
18 : : * (A partitioned table uses multiple copies of those fields, guarded by
19 : : * spinlocks, for additional concurrency.)
20 : : * This lets any subset of the hash buckets be treated as a separately
21 : : * lockable partition. We expect callers to use the low-order bits of a
22 : : * lookup key's hash value as a partition number --- this will work because
23 : : * of the way calc_bucket() maps hash values to bucket numbers.
24 : : *
25 : : * For hash tables in shared memory, the memory allocator function should
26 : : * match malloc's semantics of returning NULL on failure. For hash tables
27 : : * in local memory, we typically use palloc() which will throw error on
28 : : * failure. The code in this file has to cope with both cases.
29 : : *
30 : : * dynahash.c provides support for these types of lookup keys:
31 : : *
32 : : * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
33 : : * compared as though by strcmp(). This is selected by specifying the
34 : : * HASH_STRINGS flag to hash_create.
35 : : *
36 : : * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
37 : : * (Caller must ensure there are no undefined padding bits in the keys!)
38 : : * This is selected by specifying the HASH_BLOBS flag to hash_create.
39 : : *
40 : : * 3. More complex key behavior can be selected by specifying user-supplied
41 : : * hashing, comparison, and/or key-copying functions. At least a hashing
42 : : * function must be supplied; comparison defaults to memcmp() and key copying
43 : : * to memcpy() when a user-defined hashing function is selected.
44 : : *
45 : : * Compared to simplehash, dynahash has the following benefits:
46 : : *
47 : : * - It supports partitioning, which is useful for shared memory access using
48 : : * locks.
49 : : * - Shared memory hashes are allocated in a fixed size area at startup and
50 : : * are discoverable by name from other processes.
51 : : * - Because entries don't need to be moved in the case of hash conflicts,
52 : : * dynahash has better performance for large entries.
53 : : * - Guarantees stable pointers to entries.
54 : : *
55 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
56 : : * Portions Copyright (c) 1994, Regents of the University of California
57 : : *
58 : : *
59 : : * IDENTIFICATION
60 : : * src/backend/utils/hash/dynahash.c
61 : : *
62 : : *-------------------------------------------------------------------------
63 : : */
64 : :
65 : : /*
66 : : * Original comments:
67 : : *
68 : : * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
69 : : * Coded into C, with minor code improvements, and with hsearch(3) interface,
70 : : * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
71 : : * also, hcreate/hdestroy routines added to simulate hsearch(3).
72 : : *
73 : : * These routines simulate hsearch(3) and family, with the important
74 : : * difference that the hash table is dynamic - can grow indefinitely
75 : : * beyond its original size (as supplied to hcreate()).
76 : : *
77 : : * Performance appears to be comparable to that of hsearch(3).
78 : : * The 'source-code' options referred to in hsearch(3)'s 'man' page
79 : : * are not implemented; otherwise functionality is identical.
80 : : *
81 : : * Compilation controls:
82 : : * HASH_DEBUG controls some informative traces, mainly for debugging.
83 : : * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
84 : : * when combined with HASH_DEBUG, these are displayed by hdestroy().
85 : : *
86 : : * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 : : * concatenation property, in probably unnecessary code 'optimization'.
88 : : *
89 : : * Modified margo@postgres.berkeley.edu February 1990
90 : : * added multiple table interface
91 : : * Modified by sullivan@postgres.berkeley.edu April 1990
92 : : * changed ctl structure for shared memory
93 : : */
94 : :
95 : : #include "postgres.h"
96 : :
97 : : #include <limits.h>
98 : :
99 : : #include "access/xact.h"
100 : : #include "common/hashfn.h"
101 : : #include "port/pg_bitutils.h"
102 : : #include "storage/shmem.h"
103 : : #include "storage/spin.h"
104 : : #include "utils/dynahash.h"
105 : : #include "utils/memutils.h"
106 : :
107 : :
108 : : /*
109 : : * Constants
110 : : *
111 : : * A hash table has a top-level "directory", each of whose entries points
112 : : * to a "segment" of ssize bucket headers. The maximum number of hash
113 : : * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
114 : : * the number of records in the table can be larger, but we don't want a
115 : : * whole lot of records per bucket or performance goes down.
116 : : *
117 : : * In a hash table allocated in shared memory, the directory cannot be
118 : : * expanded because it must stay at a fixed address. The directory size
119 : : * should be selected using hash_select_dirsize (and you'd better have
120 : : * a good idea of the maximum number of entries!). For non-shared hash
121 : : * tables, the initial directory size can be left at the default.
122 : : */
123 : : #define DEF_SEGSIZE 256
124 : : #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
125 : : #define DEF_DIRSIZE 256
126 : :
127 : : /* Number of freelists to be used for a partitioned hash table. */
128 : : #define NUM_FREELISTS 32
129 : :
130 : : /* A hash bucket is a linked list of HASHELEMENTs */
131 : : typedef HASHELEMENT *HASHBUCKET;
132 : :
133 : : /* A hash segment is an array of bucket headers */
134 : : typedef HASHBUCKET *HASHSEGMENT;
135 : :
136 : : /*
137 : : * Per-freelist data.
138 : : *
139 : : * In a partitioned hash table, each freelist is associated with a specific
140 : : * set of hashcodes, as determined by the FREELIST_IDX() macro below.
141 : : * nentries tracks the number of live hashtable entries having those hashcodes
142 : : * (NOT the number of entries in the freelist, as you might expect).
143 : : *
144 : : * The coverage of a freelist might be more or less than one partition, so it
145 : : * needs its own lock rather than relying on caller locking. Relying on that
146 : : * wouldn't work even if the coverage was the same, because of the occasional
147 : : * need to "borrow" entries from another freelist; see get_hash_entry().
148 : : *
149 : : * Using an array of FreeListData instead of separate arrays of mutexes,
150 : : * nentries and freeLists helps to reduce sharing of cache lines between
151 : : * different mutexes.
152 : : */
153 : : typedef struct
154 : : {
155 : : slock_t mutex; /* spinlock for this freelist */
156 : : long nentries; /* number of entries in associated buckets */
157 : : HASHELEMENT *freeList; /* chain of free elements */
158 : : } FreeListData;
159 : :
160 : : /*
161 : : * Header structure for a hash table --- contains all changeable info
162 : : *
163 : : * In a shared-memory hash table, the HASHHDR is in shared memory, while
164 : : * each backend has a local HTAB struct. For a non-shared table, there isn't
165 : : * any functional difference between HASHHDR and HTAB, but we separate them
166 : : * anyway to share code between shared and non-shared tables.
167 : : */
168 : : struct HASHHDR
169 : : {
170 : : /*
171 : : * The freelist can become a point of contention in high-concurrency hash
172 : : * tables, so we use an array of freelists, each with its own mutex and
173 : : * nentries count, instead of just a single one. Although the freelists
174 : : * normally operate independently, we will scavenge entries from freelists
175 : : * other than a hashcode's default freelist when necessary.
176 : : *
177 : : * If the hash table is not partitioned, only freeList[0] is used and its
178 : : * spinlock is not used at all; callers' locking is assumed sufficient.
179 : : */
180 : : FreeListData freeList[NUM_FREELISTS];
181 : :
182 : : /* These fields can change, but not in a partitioned table */
183 : : /* Also, dsize can't change in a shared table, even if unpartitioned */
184 : : long dsize; /* directory size */
185 : : long nsegs; /* number of allocated segments (<= dsize) */
186 : : uint32 max_bucket; /* ID of maximum bucket in use */
187 : : uint32 high_mask; /* mask to modulo into entire table */
188 : : uint32 low_mask; /* mask to modulo into lower half of table */
189 : :
190 : : /* These fields are fixed at hashtable creation */
191 : : Size keysize; /* hash key length in bytes */
192 : : Size entrysize; /* total user element size in bytes */
193 : : long num_partitions; /* # partitions (must be power of 2), or 0 */
194 : : long max_dsize; /* 'dsize' limit if directory is fixed size */
195 : : long ssize; /* segment size --- must be power of 2 */
196 : : int sshift; /* segment shift = log2(ssize) */
197 : : int nelem_alloc; /* number of entries to allocate at once */
198 : :
199 : : #ifdef HASH_STATISTICS
200 : :
201 : : /*
202 : : * Count statistics here. NB: stats code doesn't bother with mutex, so
203 : : * counts could be corrupted a bit in a partitioned table.
204 : : */
205 : : long accesses;
206 : : long collisions;
207 : : #endif
208 : : };
209 : :
/* True when the header records a nonzero partition count. */
#define IS_PARTITIONED(hctl) \
	((hctl)->num_partitions != 0)

/*
 * Index of the freelist that serves a given hash code: the hash code modulo
 * NUM_FREELISTS for a partitioned table, and always 0 otherwise.
 */
#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
214 : :
215 : : /*
216 : : * Top control structure for a hashtable --- in a shared table, each backend
217 : : * has its own copy (OK since no fields change at runtime)
218 : : */
219 : : struct HTAB
220 : : {
221 : : HASHHDR *hctl; /* => shared control information */
222 : : HASHSEGMENT *dir; /* directory of segment starts */
223 : : HashValueFunc hash; /* hash function */
224 : : HashCompareFunc match; /* key comparison function */
225 : : HashCopyFunc keycopy; /* key copying function */
226 : : HashAllocFunc alloc; /* memory allocator */
227 : : MemoryContext hcxt; /* memory context if default allocator used */
228 : : char *tabname; /* table name (for error messages) */
229 : : bool isshared; /* true if table is in shared memory */
230 : : bool isfixed; /* if true, don't enlarge */
231 : :
232 : : /* freezing a shared table isn't allowed, so we can keep state here */
233 : : bool frozen; /* true = no more inserts allowed */
234 : :
235 : : /* We keep local copies of these fixed values to reduce contention */
236 : : Size keysize; /* hash key length in bytes */
237 : : long ssize; /* segment size --- must be power of 2 */
238 : : int sshift; /* segment shift = log2(ssize) */
239 : : };
240 : :
241 : : /*
242 : : * Key (also entry) part of a HASHELEMENT
243 : : */
/*
 * Address of the key (which is also the start of the user entry) that
 * follows a HASHELEMENT header, after alignment padding.
 */
#define ELEMENTKEY(helem) \
	((char *) (helem) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Inverse of ELEMENTKEY: step back from a key pointer to its HASHELEMENT.
 */
#define ELEMENT_FROM_KEY(key) \
	((HASHELEMENT *) ((char *) (key) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Cheap modulo by masking.  Only correct when y is a power of 2!
 */
#define MOD(x, y) ((x) & ((y) - 1))
256 : :
257 : : #ifdef HASH_STATISTICS
258 : : static long hash_accesses,
259 : : hash_collisions,
260 : : hash_expansions;
261 : : #endif
262 : :
263 : : /*
264 : : * Private function prototypes
265 : : */
266 : : static void *DynaHashAlloc(Size size);
267 : : static HASHSEGMENT seg_alloc(HTAB *hashp);
268 : : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
269 : : static bool dir_realloc(HTAB *hashp);
270 : : static bool expand_table(HTAB *hashp);
271 : : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
272 : : static void hdefault(HTAB *hashp);
273 : : static int choose_nelem_alloc(Size entrysize);
274 : : static bool init_htab(HTAB *hashp, long nelem);
275 : : static void hash_corrupted(HTAB *hashp) pg_attribute_noreturn();
276 : : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
277 : : HASHBUCKET **bucketptr);
278 : : static long next_pow2_long(long num);
279 : : static int next_pow2_int(long num);
280 : : static void register_seq_scan(HTAB *hashp);
281 : : static void deregister_seq_scan(HTAB *hashp);
282 : : static bool has_seq_scans(HTAB *hashp);
283 : :
284 : :
285 : : /*
286 : : * memory allocation support
287 : : */
288 : : static MemoryContext CurrentDynaHashCxt = NULL;
289 : :
290 : : static void *
8691 tgl@sss.pgh.pa.us 291 :CBC 1126930 : DynaHashAlloc(Size size)
292 : : {
8332 JanWieck@Yahoo.com 293 [ + - - + : 1126930 : Assert(MemoryContextIsValid(CurrentDynaHashCxt));
- - - - -
- ]
548 tgl@sss.pgh.pa.us 294 : 1126930 : return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
295 : : MCXT_ALLOC_NO_OOM);
296 : : }
297 : :
298 : :
299 : : /*
300 : : * HashCompareFunc for string keys
301 : : *
302 : : * Because we copy keys with strlcpy(), they will be truncated at keysize-1
303 : : * bytes, so we can only compare that many ... hence strncmp is almost but
304 : : * not quite the right thing.
305 : : */
306 : : static int
6409 307 : 417324 : string_compare(const char *key1, const char *key2, Size keysize)
308 : : {
309 : 417324 : return strncmp(key1, key2, keysize - 1);
310 : : }
311 : :
312 : :
313 : : /************************** CREATE ROUTINES **********************/
314 : :
315 : : /*
316 : : * hash_create -- create a new dynamic hash table
317 : : *
318 : : * tabname: a name for the table (for debugging purposes)
319 : : * nelem: maximum number of elements expected
320 : : * *info: additional table parameters, as indicated by flags
321 : : * flags: bitmask indicating which parameters to take from *info
322 : : *
323 : : * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
324 : : * optional, but the default keysize and entrysize values were useless.)
325 : : * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
326 : : * or HASH_FUNCTION, to define the key hashing semantics (C strings,
327 : : * binary blobs, or custom, respectively). Callers specifying a custom
328 : : * hash function will likely also want to use HASH_COMPARE, and perhaps
329 : : * also HASH_KEYCOPY, to control key comparison and copying.
330 : : * Another often-used flag is HASH_CONTEXT, to allocate the hash table
331 : : * under info->hcxt rather than under TopMemoryContext; the default
332 : : * behavior is only suitable for session-lifespan hash tables.
333 : : * Other flags bits are special-purpose and seldom used, except for those
334 : : * associated with shared-memory hash tables, for which see ShmemInitHash().
335 : : *
336 : : * Fields in *info are read only when the associated flags bit is set.
337 : : * It is not necessary to initialize other fields of *info.
338 : : * Neither tabname nor *info need persist after the hash_create() call.
339 : : *
340 : : * Note: It is deprecated for callers of hash_create() to explicitly specify
341 : : * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
342 : : * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
343 : : * one of these.
344 : : *
345 : : * Note: for a shared-memory hashtable, nelem needs to be a pretty good
346 : : * estimate, since we can't expand the table on the fly. But an unshared
347 : : * hashtable can be expanded on-the-fly, so it's better for nelem to be
348 : : * on the small side and let the table grow if it's exceeded. An overly
349 : : * large nelem will penalize hash_seq_search speed without buying much.
350 : : */
351 : : HTAB *
1216 352 : 208926 : hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
353 : : {
354 : : HTAB *hashp;
355 : : HASHHDR *hctl;
356 : :
357 : : /*
358 : : * Hash tables now allocate space for key and data, but you have to say
359 : : * how much space to allocate.
360 : : */
361 [ - + ]: 208926 : Assert(flags & HASH_ELEM);
362 [ - + ]: 208926 : Assert(info->keysize > 0);
363 [ - + ]: 208926 : Assert(info->entrysize >= info->keysize);
364 : :
365 : : /*
366 : : * For shared hash tables, we have a local hash header (HTAB struct) that
367 : : * we allocate in TopMemoryContext; all else is in shared memory.
368 : : *
369 : : * For non-shared hash tables, everything including the hash header is in
370 : : * a memory context created specially for the hash table --- this makes
371 : : * hash_destroy very simple. The memory context is made a child of either
372 : : * a context specified by the caller, or TopMemoryContext if nothing is
373 : : * specified.
374 : : */
6918 375 [ + + ]: 208926 : if (flags & HASH_SHARED_MEM)
376 : : {
377 : : /* Set up to allocate the hash header */
378 : 8986 : CurrentDynaHashCxt = TopMemoryContext;
379 : : }
380 : : else
381 : : {
382 : : /* Create the hash table's private memory context */
383 [ + + ]: 199940 : if (flags & HASH_CONTEXT)
384 : 85819 : CurrentDynaHashCxt = info->hcxt;
385 : : else
386 : 114121 : CurrentDynaHashCxt = TopMemoryContext;
2210 387 : 199940 : CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
388 : : "dynahash",
389 : : ALLOCSET_DEFAULT_SIZES);
390 : : }
391 : :
392 : : /* Initialize the hash header, plus a copy of the table name */
2489 393 : 208926 : hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
9705 bruce@momjian.us 394 [ + - + - : 2716038 : MemSet(hashp, 0, sizeof(HTAB));
+ - + - +
+ ]
395 : :
6918 tgl@sss.pgh.pa.us 396 : 208926 : hashp->tabname = (char *) (hashp + 1);
8227 397 : 208926 : strcpy(hashp->tabname, tabname);
398 : :
399 : : /* If we have a private context, label it with hashtable's name */
2210 400 [ + + ]: 208926 : if (!(flags & HASH_SHARED_MEM))
401 : 199940 : MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);
402 : :
403 : : /*
404 : : * Select the appropriate hash function (see comments at head of file).
405 : : */
9716 bruce@momjian.us 406 [ + + ]: 208926 : if (flags & HASH_FUNCTION)
407 : : {
1216 tgl@sss.pgh.pa.us 408 [ - + ]: 7286 : Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
9716 bruce@momjian.us 409 : 7286 : hashp->hash = info->hash;
410 : : }
3405 tgl@sss.pgh.pa.us 411 [ + + ]: 201640 : else if (flags & HASH_BLOBS)
412 : : {
1216 413 [ - + ]: 154098 : Assert(!(flags & HASH_STRINGS));
414 : : /* We can optimize hashing for common key sizes */
3405 415 [ + + ]: 154098 : if (info->keysize == sizeof(uint32))
416 : 92598 : hashp->hash = uint32_hash;
417 : : else
418 : 61500 : hashp->hash = tag_hash;
419 : : }
420 : : else
421 : : {
422 : : /*
423 : : * string_hash used to be considered the default hash method, and in a
424 : : * non-assert build it effectively still is. But we now consider it
425 : : * an assertion error to not say HASH_STRINGS explicitly. To help
426 : : * catch mistaken usage of HASH_STRINGS, we also insist on a
427 : : * reasonably long string length: if the keysize is only 4 or 8 bytes,
428 : : * it's almost certainly an integer or pointer not a string.
429 : : */
1216 430 [ - + ]: 47542 : Assert(flags & HASH_STRINGS);
431 [ - + ]: 47542 : Assert(info->keysize > 8);
432 : :
433 : 47542 : hashp->hash = string_hash;
434 : : }
435 : :
436 : : /*
437 : : * If you don't specify a match function, it defaults to string_compare if
438 : : * you used string_hash, and to memcmp otherwise.
439 : : *
440 : : * Note: explicitly specifying string_hash is deprecated, because this
441 : : * might not work for callers in loadable modules on some platforms due to
442 : : * referencing a trampoline instead of the string_hash function proper.
443 : : * Specify HASH_STRINGS instead.
444 : : */
7544 445 [ + + ]: 208926 : if (flags & HASH_COMPARE)
446 : 5490 : hashp->match = info->match;
447 [ + + ]: 203436 : else if (hashp->hash == string_hash)
6409 448 : 47542 : hashp->match = (HashCompareFunc) string_compare;
449 : : else
7544 450 : 155894 : hashp->match = memcmp;
451 : :
452 : : /*
453 : : * Similarly, the key-copying function defaults to strlcpy or memcpy.
454 : : */
6875 455 [ - + ]: 208926 : if (flags & HASH_KEYCOPY)
6875 tgl@sss.pgh.pa.us 456 :UBC 0 : hashp->keycopy = info->keycopy;
6875 tgl@sss.pgh.pa.us 457 [ + + ]:CBC 208926 : else if (hashp->hash == string_hash)
458 : : {
459 : : /*
460 : : * The signature of keycopy is meant for memcpy(), which returns
461 : : * void*, but strlcpy() returns size_t. Since we never use the return
462 : : * value of keycopy, and size_t is pretty much always the same size as
463 : : * void *, this should be safe. The extra cast in the middle is to
464 : : * avoid warnings from -Wcast-function-type.
465 : : */
1370 peter@eisentraut.org 466 : 47542 : hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
467 : : }
468 : : else
6875 tgl@sss.pgh.pa.us 469 : 161384 : hashp->keycopy = memcpy;
470 : :
471 : : /* And select the entry allocation function, too. */
6918 472 [ + + ]: 208926 : if (flags & HASH_ALLOC)
473 : 8986 : hashp->alloc = info->alloc;
474 : : else
475 : 199940 : hashp->alloc = DynaHashAlloc;
476 : :
9716 bruce@momjian.us 477 [ + + ]: 208926 : if (flags & HASH_SHARED_MEM)
478 : : {
479 : : /*
480 : : * ctl structure and directory are preallocated for shared memory
481 : : * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
482 : : * well.
483 : : */
8231 tgl@sss.pgh.pa.us 484 : 8986 : hashp->hctl = info->hctl;
6476 485 : 8986 : hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
8332 JanWieck@Yahoo.com 486 : 8986 : hashp->hcxt = NULL;
8227 tgl@sss.pgh.pa.us 487 : 8986 : hashp->isshared = true;
488 : :
489 : : /* hash table already exists, we're just attaching to it */
9716 bruce@momjian.us 490 [ - + ]: 8986 : if (flags & HASH_ATTACH)
491 : : {
492 : : /* make local copies of some heavily-used values */
6476 tgl@sss.pgh.pa.us 493 :UBC 0 : hctl = hashp->hctl;
494 : 0 : hashp->keysize = hctl->keysize;
495 : 0 : hashp->ssize = hctl->ssize;
496 : 0 : hashp->sshift = hctl->sshift;
497 : :
9357 bruce@momjian.us 498 : 0 : return hashp;
499 : : }
500 : : }
501 : : else
502 : : {
503 : : /* setup hash table defaults */
9183 tgl@sss.pgh.pa.us 504 :CBC 199940 : hashp->hctl = NULL;
9716 bruce@momjian.us 505 : 199940 : hashp->dir = NULL;
7544 tgl@sss.pgh.pa.us 506 : 199940 : hashp->hcxt = CurrentDynaHashCxt;
8227 507 : 199940 : hashp->isshared = false;
508 : : }
509 : :
9716 bruce@momjian.us 510 [ + + ]: 208926 : if (!hashp->hctl)
511 : : {
8231 tgl@sss.pgh.pa.us 512 : 199940 : hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
9716 bruce@momjian.us 513 [ - + ]: 199940 : if (!hashp->hctl)
7111 neilc@samurai.com 514 [ # # ]:UBC 0 : ereport(ERROR,
515 : : (errcode(ERRCODE_OUT_OF_MEMORY),
516 : : errmsg("out of memory")));
517 : : }
518 : :
6198 tgl@sss.pgh.pa.us 519 :CBC 208926 : hashp->frozen = false;
520 : :
7114 neilc@samurai.com 521 : 208926 : hdefault(hashp);
522 : :
9716 bruce@momjian.us 523 : 208926 : hctl = hashp->hctl;
524 : :
6476 tgl@sss.pgh.pa.us 525 [ + + ]: 208926 : if (flags & HASH_PARTITION)
526 : : {
527 : : /* Doesn't make sense to partition a local hash table */
528 [ - + ]: 4490 : Assert(flags & HASH_SHARED_MEM);
529 : :
530 : : /*
531 : : * The number of partitions had better be a power of 2. Also, it must
532 : : * be less than INT_MAX (see init_htab()), so call the int version of
533 : : * next_pow2.
534 : : */
4142 535 [ - + ]: 4490 : Assert(info->num_partitions == next_pow2_int(info->num_partitions));
536 : :
6476 537 : 4490 : hctl->num_partitions = info->num_partitions;
538 : : }
539 : :
9716 bruce@momjian.us 540 [ - + ]: 208926 : if (flags & HASH_SEGMENT)
541 : : {
9716 bruce@momjian.us 542 :UBC 0 : hctl->ssize = info->ssize;
543 : 0 : hctl->sshift = my_log2(info->ssize);
544 : : /* ssize had better be a power of 2 */
9085 tgl@sss.pgh.pa.us 545 [ # # ]: 0 : Assert(hctl->ssize == (1L << hctl->sshift));
546 : : }
547 : :
548 : : /*
549 : : * SHM hash tables have fixed directory size passed by the caller.
550 : : */
9716 bruce@momjian.us 551 [ + + ]:CBC 208926 : if (flags & HASH_DIRSIZE)
552 : : {
9183 tgl@sss.pgh.pa.us 553 : 8986 : hctl->max_dsize = info->max_dsize;
554 : 8986 : hctl->dsize = info->dsize;
555 : : }
556 : :
557 : : /* remember the entry sizes, too */
1216 558 : 208926 : hctl->keysize = info->keysize;
559 : 208926 : hctl->entrysize = info->entrysize;
560 : :
561 : : /* make local copies of heavily-used constant fields */
6476 562 : 208926 : hashp->keysize = hctl->keysize;
563 : 208926 : hashp->ssize = hctl->ssize;
564 : 208926 : hashp->sshift = hctl->sshift;
565 : :
566 : : /* Build the hash directory structure */
8231 567 [ - + ]: 208926 : if (!init_htab(hashp, nelem))
6060 tgl@sss.pgh.pa.us 568 [ # # ]:UBC 0 : elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
569 : :
570 : : /*
571 : : * For a shared hash table, preallocate the requested number of elements.
572 : : * This reduces problems with run-time out-of-shared-memory conditions.
573 : : *
574 : : * For a non-shared hash table, preallocate the requested number of
575 : : * elements if it's less than our chosen nelem_alloc. This avoids wasting
576 : : * space if the caller correctly estimates a small table size.
577 : : */
6503 tgl@sss.pgh.pa.us 578 [ + + ]:CBC 208926 : if ((flags & HASH_SHARED_MEM) ||
579 [ + + ]: 199940 : nelem < hctl->nelem_alloc)
580 : : {
581 : : int i,
582 : : freelist_partitions,
583 : : nelem_alloc,
584 : : nelem_alloc_first;
585 : :
586 : : /*
587 : : * If hash table is partitioned, give each freelist an equal share of
588 : : * the initial allocation. Otherwise only freeList[0] is used.
589 : : */
2944 rhaas@postgresql.org 590 [ + + ]: 82178 : if (IS_PARTITIONED(hashp->hctl))
591 : 4490 : freelist_partitions = NUM_FREELISTS;
592 : : else
593 : 77688 : freelist_partitions = 1;
594 : :
595 : 82178 : nelem_alloc = nelem / freelist_partitions;
2458 tgl@sss.pgh.pa.us 596 [ - + ]: 82178 : if (nelem_alloc <= 0)
2944 rhaas@postgresql.org 597 :UBC 0 : nelem_alloc = 1;
598 : :
599 : : /*
600 : : * Make sure we'll allocate all the requested elements; freeList[0]
601 : : * gets the excess if the request isn't divisible by NUM_FREELISTS.
602 : : */
2944 rhaas@postgresql.org 603 [ + + ]:CBC 82178 : if (nelem_alloc * freelist_partitions < nelem)
604 : 41 : nelem_alloc_first =
605 : 41 : nelem - nelem_alloc * (freelist_partitions - 1);
606 : : else
607 : 82137 : nelem_alloc_first = nelem_alloc;
608 : :
609 [ + + ]: 303546 : for (i = 0; i < freelist_partitions; i++)
610 : : {
611 [ + + ]: 221368 : int temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
612 : :
613 [ - + ]: 221368 : if (!element_alloc(hashp, temp, i))
2944 rhaas@postgresql.org 614 [ # # ]:UBC 0 : ereport(ERROR,
615 : : (errcode(ERRCODE_OUT_OF_MEMORY),
616 : : errmsg("out of memory")));
617 : : }
618 : : }
619 : :
4752 heikki.linnakangas@i 620 [ + + ]:CBC 208926 : if (flags & HASH_FIXED_SIZE)
621 : 3592 : hashp->isfixed = true;
9357 bruce@momjian.us 622 : 208926 : return hashp;
623 : : }
624 : :
625 : : /*
626 : : * Set default HASHHDR parameters.
627 : : */
628 : : static void
9715 629 : 208926 : hdefault(HTAB *hashp)
630 : : {
8207 631 : 208926 : HASHHDR *hctl = hashp->hctl;
632 : :
8231 tgl@sss.pgh.pa.us 633 [ + - + - : 22355082 : MemSet(hctl, 0, sizeof(HASHHDR));
+ - + - +
+ ]
634 : :
6476 635 : 208926 : hctl->dsize = DEF_DIRSIZE;
9716 bruce@momjian.us 636 : 208926 : hctl->nsegs = 0;
637 : :
6476 tgl@sss.pgh.pa.us 638 : 208926 : hctl->num_partitions = 0; /* not partitioned */
639 : :
640 : : /* table has no fixed maximum size */
9716 bruce@momjian.us 641 : 208926 : hctl->max_dsize = NO_MAX_DSIZE;
642 : :
6476 tgl@sss.pgh.pa.us 643 : 208926 : hctl->ssize = DEF_SEGSIZE;
644 : 208926 : hctl->sshift = DEF_SEGSIZE_SHIFT;
645 : :
646 : : #ifdef HASH_STATISTICS
647 : : hctl->accesses = hctl->collisions = 0;
648 : : #endif
10141 scrappy@hub.org 649 : 208926 : }
650 : :
651 : : /*
652 : : * Given the user-specified entry size, choose nelem_alloc, ie, how many
653 : : * elements to add to the hash table when we need more.
654 : : */
655 : : static int
6503 tgl@sss.pgh.pa.us 656 : 225722 : choose_nelem_alloc(Size entrysize)
657 : : {
658 : : int nelem_alloc;
659 : : Size elementSize;
660 : : Size allocSize;
661 : :
662 : : /* Each element has a HASHELEMENT header plus user data. */
663 : : /* NB: this had better match element_alloc() */
664 : 225722 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
665 : :
666 : : /*
667 : : * The idea here is to choose nelem_alloc at least 32, but round up so
668 : : * that the allocation request will be a power of 2 or just less. This
669 : : * makes little difference for hash tables in shared memory, but for hash
670 : : * tables managed by palloc, the allocation request will be rounded up to
671 : : * a power of 2 anyway. If we fail to take this into account, we'll waste
672 : : * as much as half the allocated space.
673 : : */
674 : 225722 : allocSize = 32 * 4; /* assume elementSize at least 8 */
675 : : do
676 : : {
677 : 933919 : allocSize <<= 1;
678 : 933919 : nelem_alloc = allocSize / elementSize;
679 [ + + ]: 933919 : } while (nelem_alloc < 32);
680 : :
681 : 225722 : return nelem_alloc;
682 : : }
683 : :
684 : : /*
685 : : * Compute derived fields of hctl and build the initial directory/segment
686 : : * arrays
687 : : */
688 : : static bool
8231 689 : 208926 : init_htab(HTAB *hashp, long nelem)
690 : : {
8207 bruce@momjian.us 691 : 208926 : HASHHDR *hctl = hashp->hctl;
692 : : HASHSEGMENT *segp;
693 : : int nbuckets;
694 : : int nsegs;
695 : : int i;
696 : :
697 : : /*
698 : : * initialize mutexes if it's a partitioned table
699 : : */
6476 tgl@sss.pgh.pa.us 700 [ + + ]: 208926 : if (IS_PARTITIONED(hctl))
2944 rhaas@postgresql.org 701 [ + + ]: 148170 : for (i = 0; i < NUM_FREELISTS; i++)
702 : 143680 : SpinLockInit(&(hctl->freeList[i].mutex));
703 : :
704 : : /*
705 : : * Allocate space for the next greater power of two number of buckets,
706 : : * assuming a desired maximum load factor of 1.
707 : : */
1303 tmunro@postgresql.or 708 : 208926 : nbuckets = next_pow2_int(nelem);
709 : :
710 : : /*
711 : : * In a partitioned table, nbuckets must be at least equal to
712 : : * num_partitions; were it less, keys with apparently different partition
713 : : * numbers would map to the same bucket, breaking partition independence.
714 : : * (Normally nbuckets will be much bigger; this is just a safety check.)
715 : : */
6476 tgl@sss.pgh.pa.us 716 [ - + ]: 208926 : while (nbuckets < hctl->num_partitions)
6476 tgl@sss.pgh.pa.us 717 :UBC 0 : nbuckets <<= 1;
718 : :
9716 bruce@momjian.us 719 :CBC 208926 : hctl->max_bucket = hctl->low_mask = nbuckets - 1;
720 : 208926 : hctl->high_mask = (nbuckets << 1) - 1;
721 : :
722 : : /*
723 : : * Figure number of directory segments needed, round up to a power of 2
724 : : */
725 : 208926 : nsegs = (nbuckets - 1) / hctl->ssize + 1;
4142 tgl@sss.pgh.pa.us 726 : 208926 : nsegs = next_pow2_int(nsegs);
727 : :
728 : : /*
729 : : * Make sure directory is big enough. If pre-allocated directory is too
730 : : * small, choke (caller screwed up).
731 : : */
9183 732 [ - + ]: 208926 : if (nsegs > hctl->dsize)
733 : : {
9183 tgl@sss.pgh.pa.us 734 [ # # ]:UBC 0 : if (!(hashp->dir))
735 : 0 : hctl->dsize = nsegs;
736 : : else
8231 737 : 0 : return false;
738 : : }
739 : :
740 : : /* Allocate a directory */
9716 bruce@momjian.us 741 [ + + ]:CBC 208926 : if (!(hashp->dir))
742 : : {
8332 JanWieck@Yahoo.com 743 : 199940 : CurrentDynaHashCxt = hashp->hcxt;
8231 tgl@sss.pgh.pa.us 744 : 199940 : hashp->dir = (HASHSEGMENT *)
745 : 199940 : hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
9716 bruce@momjian.us 746 [ - + ]: 199940 : if (!hashp->dir)
8231 tgl@sss.pgh.pa.us 747 :UBC 0 : return false;
748 : : }
749 : :
750 : : /* Allocate initial segments */
9716 bruce@momjian.us 751 [ + + ]:CBC 619113 : for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
752 : : {
753 : 410187 : *segp = seg_alloc(hashp);
8231 tgl@sss.pgh.pa.us 754 [ - + ]: 410187 : if (*segp == NULL)
8231 tgl@sss.pgh.pa.us 755 :UBC 0 : return false;
756 : : }
757 : :
758 : : /* Choose number of entries to allocate at a time */
6503 tgl@sss.pgh.pa.us 759 :CBC 208926 : hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
760 : :
761 : : #ifdef HASH_DEBUG
762 : : fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
763 : : "TABLE POINTER ", hashp,
764 : : "DIRECTORY SIZE ", hctl->dsize,
765 : : "SEGMENT SIZE ", hctl->ssize,
766 : : "SEGMENT SHIFT ", hctl->sshift,
767 : : "MAX BUCKET ", hctl->max_bucket,
768 : : "HIGH MASK ", hctl->high_mask,
769 : : "LOW MASK ", hctl->low_mask,
770 : : "NSEGS ", hctl->nsegs);
771 : : #endif
8231 772 : 208926 : return true;
773 : : }
774 : :
775 : : /*
776 : : * Estimate the space needed for a hashtable containing the given number
777 : : * of entries of given size.
778 : : * NOTE: this is used to estimate the footprint of hashtables in shared
779 : : * memory; therefore it does not count HTAB which is in local memory.
780 : : * NB: assumes that all hash structure parameters have default values!
781 : : */
782 : : Size
7544 783 : 16796 : hash_estimate_size(long num_entries, Size entrysize)
784 : : {
785 : : Size size;
786 : : long nBuckets,
787 : : nSegments,
788 : : nDirEntries,
789 : : nElementAllocs,
790 : : elementSize,
791 : : elementAllocCnt;
792 : :
793 : : /* estimate number of buckets wanted */
1303 tmunro@postgresql.or 794 : 16796 : nBuckets = next_pow2_long(num_entries);
795 : : /* # of segments needed for nBuckets */
4142 tgl@sss.pgh.pa.us 796 : 16796 : nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
797 : : /* directory entries */
9171 798 : 16796 : nDirEntries = DEF_DIRSIZE;
799 [ - + ]: 16796 : while (nDirEntries < nSegments)
9171 tgl@sss.pgh.pa.us 800 :UBC 0 : nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
801 : :
802 : : /* fixed control info */
6812 tgl@sss.pgh.pa.us 803 :CBC 16796 : size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
804 : : /* directory */
805 : 16796 : size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
806 : : /* segments */
807 : 16796 : size = add_size(size, mul_size(nSegments,
808 : : MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
809 : : /* elements --- allocated in groups of choose_nelem_alloc() entries */
6503 810 : 16796 : elementAllocCnt = choose_nelem_alloc(entrysize);
6867 811 : 16796 : nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
6503 812 : 16796 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
6812 813 : 16796 : size = add_size(size,
814 : : mul_size(nElementAllocs,
815 : : mul_size(elementAllocCnt, elementSize)));
816 : :
9183 817 : 16796 : return size;
818 : : }
819 : :
820 : : /*
821 : : * Select an appropriate directory size for a hashtable with the given
822 : : * maximum number of entries.
823 : : * This is only needed for hashtables in shared memory, whose directories
824 : : * cannot be expanded dynamically.
825 : : * NB: assumes that all hash structure parameters have default values!
826 : : *
827 : : * XXX this had better agree with the behavior of init_htab()...
828 : : */
829 : : long
8814 830 : 8986 : hash_select_dirsize(long num_entries)
831 : : {
832 : : long nBuckets,
833 : : nSegments,
834 : : nDirEntries;
835 : :
836 : : /* estimate number of buckets wanted */
1303 tmunro@postgresql.or 837 : 8986 : nBuckets = next_pow2_long(num_entries);
838 : : /* # of segments needed for nBuckets */
4142 tgl@sss.pgh.pa.us 839 : 8986 : nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
840 : : /* directory entries */
8814 841 : 8986 : nDirEntries = DEF_DIRSIZE;
842 [ - + ]: 8986 : while (nDirEntries < nSegments)
8814 tgl@sss.pgh.pa.us 843 :UBC 0 : nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
844 : :
8814 tgl@sss.pgh.pa.us 845 :CBC 8986 : return nDirEntries;
846 : : }
847 : :
848 : : /*
849 : : * Compute the required initial memory allocation for a shared-memory
850 : : * hashtable with the given parameters. We need space for the HASHHDR
851 : : * and for the (non expansible) directory.
852 : : */
853 : : Size
6476 854 : 8986 : hash_get_shared_size(HASHCTL *info, int flags)
855 : : {
856 [ - + ]: 8986 : Assert(flags & HASH_DIRSIZE);
857 [ - + ]: 8986 : Assert(info->dsize == info->max_dsize);
858 : 8986 : return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
859 : : }
860 : :
861 : :
862 : : /********************** DESTROY ROUTINES ************************/
863 : :
864 : : void
9715 bruce@momjian.us 865 : 41706 : hash_destroy(HTAB *hashp)
866 : : {
9716 867 [ + - ]: 41706 : if (hashp != NULL)
868 : : {
869 : : /* allocation method must be one we know how to free, too */
6918 tgl@sss.pgh.pa.us 870 [ - + ]: 41706 : Assert(hashp->alloc == DynaHashAlloc);
871 : : /* so this hashtable must have its own context */
8332 JanWieck@Yahoo.com 872 [ - + ]: 41706 : Assert(hashp->hcxt != NULL);
873 : :
9171 tgl@sss.pgh.pa.us 874 : 41706 : hash_stats("destroy", hashp);
875 : :
876 : : /*
877 : : * Free everything by destroying the hash table's memory context.
878 : : */
8332 JanWieck@Yahoo.com 879 : 41706 : MemoryContextDelete(hashp->hcxt);
880 : : }
10141 scrappy@hub.org 881 : 41706 : }
882 : :
883 : : void
8227 tgl@sss.pgh.pa.us 884 : 41706 : hash_stats(const char *where, HTAB *hashp)
885 : : {
886 : : #ifdef HASH_STATISTICS
887 : : fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
888 : : where, hashp->hctl->accesses, hashp->hctl->collisions);
889 : :
890 : : fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
891 : : hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
892 : : hashp->hctl->max_bucket, hashp->hctl->nsegs);
893 : : fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
894 : : where, hash_accesses, hash_collisions);
895 : : fprintf(stderr, "hash_stats: total expansions %ld\n",
896 : : hash_expansions);
897 : : #endif
10141 scrappy@hub.org 898 : 41706 : }
899 : :
900 : : /*******************************SEARCH ROUTINES *****************************/
901 : :
902 : :
903 : : /*
904 : : * get_hash_value -- exported routine to calculate a key's hash value
905 : : *
906 : : * We export this because for partitioned tables, callers need to compute
907 : : * the partition number (from the low-order bits of the hash value) before
908 : : * searching.
909 : : */
910 : : uint32
6476 tgl@sss.pgh.pa.us 911 : 68948973 : get_hash_value(HTAB *hashp, const void *keyPtr)
912 : : {
913 : 68948973 : return hashp->hash(keyPtr, hashp->keysize);
914 : : }
915 : :
916 : : /* Convert a hash value to a bucket number */
917 : : static inline uint32
7544 918 : 160470833 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
919 : : {
920 : : uint32 bucket;
921 : :
9716 bruce@momjian.us 922 : 160470833 : bucket = hash_val & hctl->high_mask;
923 [ + + ]: 160470833 : if (bucket > hctl->max_bucket)
924 : 77078215 : bucket = bucket & hctl->low_mask;
925 : :
8072 tgl@sss.pgh.pa.us 926 : 160470833 : return bucket;
927 : : }
928 : :
929 : : /*
930 : : * hash_search -- look up key in table and perform action
931 : : * hash_search_with_hash_value -- same, with key's hash value already computed
932 : : *
933 : : * action is one of:
934 : : * HASH_FIND: look up key in table
935 : : * HASH_ENTER: look up key in table, creating entry if not present
936 : : * HASH_ENTER_NULL: same, but return NULL if out of memory
937 : : * HASH_REMOVE: look up key in table, remove entry if present
938 : : *
939 : : * Return value is a pointer to the element found/entered/removed if any,
940 : : * or NULL if no match was found. (NB: in the case of the REMOVE action,
941 : : * the result is a dangling pointer that shouldn't be dereferenced!)
942 : : *
943 : : * HASH_ENTER will normally ereport a generic "out of memory" error if
944 : : * it is unable to create a new entry. The HASH_ENTER_NULL operation is
945 : : * the same except it will return NULL if out of memory.
946 : : *
947 : : * If foundPtr isn't NULL, then *foundPtr is set true if we found an
948 : : * existing entry in the table, false otherwise. This is needed in the
949 : : * HASH_ENTER case, but is redundant with the return value otherwise.
950 : : *
951 : : * For hash_search_with_hash_value, the hashvalue parameter must have been
952 : : * calculated with get_hash_value().
953 : : */
954 : : void *
9715 bruce@momjian.us 955 : 97946946 : hash_search(HTAB *hashp,
956 : : const void *keyPtr,
957 : : HASHACTION action,
958 : : bool *foundPtr)
959 : : {
6476 tgl@sss.pgh.pa.us 960 : 97946946 : return hash_search_with_hash_value(hashp,
961 : : keyPtr,
962 : 97946946 : hashp->hash(keyPtr, hashp->keysize),
963 : : action,
964 : : foundPtr);
965 : : }
966 : :
967 : : void *
968 : 159970182 : hash_search_with_hash_value(HTAB *hashp,
969 : : const void *keyPtr,
970 : : uint32 hashvalue,
971 : : HASHACTION action,
972 : : bool *foundPtr)
973 : : {
8207 bruce@momjian.us 974 : 159970182 : HASHHDR *hctl = hashp->hctl;
2458 tgl@sss.pgh.pa.us 975 [ + + ]: 159970182 : int freelist_idx = FREELIST_IDX(hctl, hashvalue);
976 : : Size keysize;
977 : : HASHBUCKET currBucket;
978 : : HASHBUCKET *prevBucketPtr;
979 : : HashCompareFunc match;
980 : :
981 : : #ifdef HASH_STATISTICS
982 : : hash_accesses++;
983 : : hctl->accesses++;
984 : : #endif
985 : :
986 : : /*
987 : : * If inserting, check if it is time to split a bucket.
988 : : *
989 : : * NOTE: failure to expand table is not a fatal error, it just means we
990 : : * have to run at higher fill factor than we wanted. However, if we're
991 : : * using the palloc allocator then it will throw error anyway on
992 : : * out-of-memory, so we must do this before modifying the table.
993 : : */
4195 994 [ + + + + ]: 159970182 : if (action == HASH_ENTER || action == HASH_ENTER_NULL)
995 : : {
996 : : /*
997 : : * Can't split if running in partitioned mode, nor if frozen, nor if
998 : : * table is the subject of any active hash_seq_search scans.
999 : : */
1303 tmunro@postgresql.or 1000 [ + + ]: 39252909 : if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
1001 [ + - + - ]: 351581 : !IS_PARTITIONED(hctl) && !hashp->frozen &&
4195 tgl@sss.pgh.pa.us 1002 [ + - ]: 351581 : !has_seq_scans(hashp))
1003 : 351581 : (void) expand_table(hashp);
1004 : : }
1005 : :
1006 : : /*
1007 : : * Do the initial lookup
1008 : : */
30 michael@paquier.xyz 1009 :GNC 159970182 : (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
6895 tgl@sss.pgh.pa.us 1010 :CBC 159970182 : currBucket = *prevBucketPtr;
1011 : :
1012 : : /*
1013 : : * Follow collision chain looking for matching key
1014 : : */
1015 : 159970182 : match = hashp->match; /* save one fetch in inner loop */
6476 1016 : 159970182 : keysize = hashp->keysize; /* ditto */
1017 : :
6895 1018 [ + + ]: 195456953 : while (currBucket != NULL)
1019 : : {
1020 [ + + + + ]: 288049311 : if (currBucket->hashvalue == hashvalue &&
1021 : 126282483 : match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
1022 : 126280057 : break;
1023 : 35486771 : prevBucketPtr = &(currBucket->link);
8231 1024 : 35486771 : currBucket = *prevBucketPtr;
1025 : : #ifdef HASH_STATISTICS
1026 : : hash_collisions++;
1027 : : hctl->collisions++;
1028 : : #endif
1029 : : }
1030 : :
8227 1031 [ + + ]: 159970182 : if (foundPtr)
1032 : 40534764 : *foundPtr = (bool) (currBucket != NULL);
1033 : :
1034 : : /*
1035 : : * OK, now what?
1036 : : */
9716 bruce@momjian.us 1037 [ + + + + ]: 159970182 : switch (action)
1038 : : {
8227 tgl@sss.pgh.pa.us 1039 : 99228011 : case HASH_FIND:
8231 1040 [ + + ]: 99228011 : if (currBucket != NULL)
1041 : 91098070 : return (void *) ELEMENTKEY(currBucket);
8227 1042 : 8129941 : return NULL;
1043 : :
9715 bruce@momjian.us 1044 : 21489261 : case HASH_REMOVE:
8231 tgl@sss.pgh.pa.us 1045 [ + + ]: 21489261 : if (currBucket != NULL)
1046 : : {
1047 : : /* if partitioned, must lock to touch nentries and freeList */
3103 rhaas@postgresql.org 1048 [ + + ]: 21485372 : if (IS_PARTITIONED(hctl))
2944 1049 [ + + ]: 4686351 : SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
1050 : :
1051 : : /* delete the record from the appropriate nentries counter. */
1052 [ - + ]: 21485372 : Assert(hctl->freeList[freelist_idx].nentries > 0);
1053 : 21485372 : hctl->freeList[freelist_idx].nentries--;
1054 : :
1055 : : /* remove record from hash bucket's chain. */
8231 tgl@sss.pgh.pa.us 1056 : 21485372 : *prevBucketPtr = currBucket->link;
1057 : :
1058 : : /* add the record to the appropriate freelist. */
2944 rhaas@postgresql.org 1059 : 21485372 : currBucket->link = hctl->freeList[freelist_idx].freeList;
1060 : 21485372 : hctl->freeList[freelist_idx].freeList = currBucket;
1061 : :
3103 1062 [ + + ]: 21485372 : if (IS_PARTITIONED(hctl))
2944 1063 : 4686351 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1064 : :
1065 : : /*
1066 : : * better hope the caller is synchronizing access to this
1067 : : * element, because someone else is going to reuse it the next
1068 : : * time something is added to the table
1069 : : */
8231 tgl@sss.pgh.pa.us 1070 : 21485372 : return (void *) ELEMENTKEY(currBucket);
1071 : : }
8227 1072 : 3889 : return NULL;
1073 : :
1074 : 39252909 : case HASH_ENTER:
1075 : : case HASH_ENTER_NULL:
1076 : : /* Return existing element if found, else create one */
8231 1077 [ + + ]: 39252909 : if (currBucket != NULL)
1078 : 13696614 : return (void *) ELEMENTKEY(currBucket);
1079 : :
1080 : : /* disallow inserts if frozen */
6198 1081 [ - + ]: 25556295 : if (hashp->frozen)
6060 tgl@sss.pgh.pa.us 1082 [ # # ]:UBC 0 : elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1083 : : hashp->tabname);
1084 : :
2944 rhaas@postgresql.org 1085 :CBC 25556295 : currBucket = get_hash_entry(hashp, freelist_idx);
8227 tgl@sss.pgh.pa.us 1086 [ - + ]: 25556295 : if (currBucket == NULL)
1087 : : {
1088 : : /* out of memory */
6476 tgl@sss.pgh.pa.us 1089 [ # # ]:UBC 0 : if (action == HASH_ENTER_NULL)
1090 : 0 : return NULL;
1091 : : /* report a generic message */
1092 [ # # ]: 0 : if (hashp->isshared)
1093 [ # # ]: 0 : ereport(ERROR,
1094 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1095 : : errmsg("out of shared memory")));
1096 : : else
1097 [ # # ]: 0 : ereport(ERROR,
1098 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1099 : : errmsg("out of memory")));
1100 : : }
1101 : :
1102 : : /* link into hashbucket chain */
8227 tgl@sss.pgh.pa.us 1103 :CBC 25556295 : *prevBucketPtr = currBucket;
1104 : 25556295 : currBucket->link = NULL;
1105 : :
1106 : : /* copy key into record */
7544 1107 : 25556295 : currBucket->hashvalue = hashvalue;
6875 1108 : 25556295 : hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
1109 : :
1110 : : /*
1111 : : * Caller is expected to fill the data field on return. DO NOT
1112 : : * insert any code that could possibly throw error here, as doing
1113 : : * so would leave the table entry incomplete and hence corrupt the
1114 : : * caller's data structure.
1115 : : */
1116 : :
8227 1117 : 25556295 : return (void *) ELEMENTKEY(currBucket);
1118 : : }
1119 : :
7569 tgl@sss.pgh.pa.us 1120 [ - - ]:GBC 1 : elog(ERROR, "unrecognized hash action code: %d", (int) action);
1121 : :
1122 : : return NULL; /* keep compiler quiet */
1123 : : }
1124 : :
1125 : : /*
1126 : : * hash_update_hash_key -- change the hash key of an existing table entry
1127 : : *
1128 : : * This is equivalent to removing the entry, making a new entry, and copying
1129 : : * over its data, except that the entry never goes to the table's freelist.
1130 : : * Therefore this cannot suffer an out-of-memory failure, even if there are
1131 : : * other processes operating in other partitions of the hashtable.
1132 : : *
1133 : : * Returns true if successful, false if the requested new hash key is already
1134 : : * present. Throws error if the specified entry pointer isn't actually a
1135 : : * table member.
1136 : : *
1137 : : * NB: currently, there is no special case for old and new hash keys being
1138 : : * identical, which means we'll report false for that situation. This is
1139 : : * preferable for existing uses.
1140 : : *
1141 : : * NB: for a partitioned hashtable, caller must hold lock on both relevant
1142 : : * partitions, if the new hash key would belong to a different partition.
1143 : : */
1144 : : bool
4109 tgl@sss.pgh.pa.us 1145 :CBC 868 : hash_update_hash_key(HTAB *hashp,
1146 : : void *existingEntry,
1147 : : const void *newKeyPtr)
1148 : : {
1149 : 868 : HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
1150 : : uint32 newhashvalue;
1151 : : Size keysize;
1152 : : uint32 bucket;
1153 : : uint32 newbucket;
1154 : : HASHBUCKET currBucket;
1155 : : HASHBUCKET *prevBucketPtr;
1156 : : HASHBUCKET *oldPrevPtr;
1157 : : HashCompareFunc match;
1158 : :
1159 : : #ifdef HASH_STATISTICS
1160 : : hash_accesses++;
1161 : : hctl->accesses++;
1162 : : #endif
1163 : :
1164 : : /* disallow updates if frozen */
1165 [ - + ]: 868 : if (hashp->frozen)
4109 tgl@sss.pgh.pa.us 1166 [ # # ]:UBC 0 : elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1167 : : hashp->tabname);
1168 : :
1169 : : /*
1170 : : * Lookup the existing element using its saved hash value. We need to do
1171 : : * this to be able to unlink it from its hash chain, but as a side benefit
1172 : : * we can verify the validity of the passed existingEntry pointer.
1173 : : */
30 michael@paquier.xyz 1174 :GNC 868 : bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
1175 : : &prevBucketPtr);
4109 tgl@sss.pgh.pa.us 1176 :CBC 868 : currBucket = *prevBucketPtr;
1177 : :
1178 [ + - ]: 869 : while (currBucket != NULL)
1179 : : {
1180 [ + + ]: 869 : if (currBucket == existingElement)
1181 : 868 : break;
1182 : 1 : prevBucketPtr = &(currBucket->link);
1183 : 1 : currBucket = *prevBucketPtr;
1184 : : }
1185 : :
1186 [ - + ]: 868 : if (currBucket == NULL)
4109 tgl@sss.pgh.pa.us 1187 [ # # ]:UBC 0 : elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1188 : : hashp->tabname);
1189 : :
4109 tgl@sss.pgh.pa.us 1190 :CBC 868 : oldPrevPtr = prevBucketPtr;
1191 : :
1192 : : /*
1193 : : * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1194 : : * chain we want to put the entry into.
1195 : : */
1196 : 868 : newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
30 michael@paquier.xyz 1197 :GNC 868 : newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
4109 tgl@sss.pgh.pa.us 1198 :CBC 868 : currBucket = *prevBucketPtr;
1199 : :
1200 : : /*
1201 : : * Follow collision chain looking for matching key
1202 : : */
1203 : 868 : match = hashp->match; /* save one fetch in inner loop */
1204 : 868 : keysize = hashp->keysize; /* ditto */
1205 : :
1206 [ + + ]: 869 : while (currBucket != NULL)
1207 : : {
1208 [ - + - - ]: 1 : if (currBucket->hashvalue == newhashvalue &&
4109 tgl@sss.pgh.pa.us 1209 :UBC 0 : match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
1210 : 0 : break;
4109 tgl@sss.pgh.pa.us 1211 :CBC 1 : prevBucketPtr = &(currBucket->link);
1212 : 1 : currBucket = *prevBucketPtr;
1213 : : #ifdef HASH_STATISTICS
1214 : : hash_collisions++;
1215 : : hctl->collisions++;
1216 : : #endif
1217 : : }
1218 : :
1219 [ - + ]: 868 : if (currBucket != NULL)
4109 tgl@sss.pgh.pa.us 1220 :UBC 0 : return false; /* collision with an existing entry */
1221 : :
4109 tgl@sss.pgh.pa.us 1222 :CBC 868 : currBucket = existingElement;
1223 : :
1224 : : /*
1225 : : * If old and new hash values belong to the same bucket, we need not
1226 : : * change any chain links, and indeed should not since this simplistic
1227 : : * update will corrupt the list if currBucket is the last element. (We
1228 : : * cannot fall out earlier, however, since we need to scan the bucket to
1229 : : * check for duplicate keys.)
1230 : : */
4108 1231 [ + - ]: 868 : if (bucket != newbucket)
1232 : : {
1233 : : /* OK to remove record from old hash bucket's chain. */
1234 : 868 : *oldPrevPtr = currBucket->link;
1235 : :
1236 : : /* link into new hashbucket chain */
1237 : 868 : *prevBucketPtr = currBucket;
1238 : 868 : currBucket->link = NULL;
1239 : : }
1240 : :
1241 : : /* copy new key into record */
4109 1242 : 868 : currBucket->hashvalue = newhashvalue;
1243 : 868 : hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
1244 : :
1245 : : /* rest of record is untouched */
1246 : :
1247 : 868 : return true;
1248 : : }
1249 : :
1250 : : /*
1251 : : * Allocate a new hashtable entry if possible; return NULL if out of memory.
1252 : : * (Or, if the underlying space allocator throws error for out-of-memory,
1253 : : * we won't return at all.)
1254 : : */
1255 : : static HASHBUCKET
2944 rhaas@postgresql.org 1256 : 25556295 : get_hash_entry(HTAB *hashp, int freelist_idx)
1257 : : {
1258 : 25556295 : HASHHDR *hctl = hashp->hctl;
1259 : : HASHBUCKET newElement;
1260 : :
1261 : : for (;;)
1262 : : {
1263 : : /* if partitioned, must lock to touch nentries and freeList */
3103 1264 [ + + ]: 25757699 : if (IS_PARTITIONED(hctl))
2944 1265 [ + + ]: 5173638 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1266 : :
1267 : : /* try to get an entry from the freelist */
1268 : 25757699 : newElement = hctl->freeList[freelist_idx].freeList;
1269 : :
6476 tgl@sss.pgh.pa.us 1270 [ + + ]: 25757699 : if (newElement != NULL)
1271 : 25556295 : break;
1272 : :
3103 rhaas@postgresql.org 1273 [ + + ]: 201404 : if (IS_PARTITIONED(hctl))
2944 1274 : 1402 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1275 : :
1276 : : /*
1277 : : * No free elements in this freelist. In a partitioned table, there
1278 : : * might be entries in other freelists, but to reduce contention we
1279 : : * prefer to first try to get another chunk of buckets from the main
1280 : : * shmem allocator. If that fails, though, we *MUST* root through all
1281 : : * the other freelists before giving up. There are multiple callers
1282 : : * that assume that they can allocate every element in the initially
1283 : : * requested table size, or that deleting an element guarantees they
1284 : : * can insert a new element, even if shared memory is entirely full.
1285 : : * Failing because the needed element is in a different freelist is
1286 : : * not acceptable.
1287 : : */
1288 [ - + ]: 201404 : if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1289 : : {
1290 : : int borrow_from_idx;
1291 : :
2944 rhaas@postgresql.org 1292 [ # # ]:UBC 0 : if (!IS_PARTITIONED(hctl))
1293 : 0 : return NULL; /* out of memory */
1294 : :
1295 : : /* try to borrow element from another freelist */
1296 : 0 : borrow_from_idx = freelist_idx;
1297 : : for (;;)
1298 : : {
1299 : 0 : borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
1300 [ # # ]: 0 : if (borrow_from_idx == freelist_idx)
2458 tgl@sss.pgh.pa.us 1301 : 0 : break; /* examined all freelists, fail */
1302 : :
2944 rhaas@postgresql.org 1303 [ # # ]: 0 : SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1304 : 0 : newElement = hctl->freeList[borrow_from_idx].freeList;
1305 : :
1306 [ # # ]: 0 : if (newElement != NULL)
1307 : : {
1308 : 0 : hctl->freeList[borrow_from_idx].freeList = newElement->link;
1309 : 0 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1310 : :
1311 : : /* careful: count the new element in its proper freelist */
1312 [ # # ]: 0 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1313 : 0 : hctl->freeList[freelist_idx].nentries++;
1314 : 0 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1315 : :
2458 tgl@sss.pgh.pa.us 1316 : 0 : return newElement;
1317 : : }
1318 : :
2944 rhaas@postgresql.org 1319 : 0 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1320 : : }
1321 : :
1322 : : /* no elements available to borrow either, so out of memory */
2458 tgl@sss.pgh.pa.us 1323 : 0 : return NULL;
1324 : : }
1325 : : }
1326 : :
1327 : : /* remove entry from freelist, bump nentries */
2944 rhaas@postgresql.org 1328 :CBC 25556295 : hctl->freeList[freelist_idx].freeList = newElement->link;
1329 : 25556295 : hctl->freeList[freelist_idx].nentries++;
1330 : :
3103 1331 [ + + ]: 25556295 : if (IS_PARTITIONED(hctl))
2944 1332 : 5172236 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1333 : :
6476 tgl@sss.pgh.pa.us 1334 : 25556295 : return newElement;
1335 : : }
1336 : :
1337 : : /*
1338 : : * hash_get_num_entries -- get the number of entries in a hashtable
1339 : : */
1340 : : long
1341 : 33559 : hash_get_num_entries(HTAB *hashp)
1342 : : {
1343 : : int i;
2944 rhaas@postgresql.org 1344 : 33559 : long sum = hashp->hctl->freeList[0].nentries;
1345 : :
1346 : : /*
1347 : : * We currently don't bother with acquiring the mutexes; it's only
1348 : : * sensible to call this function if you've got lock on all partitions of
1349 : : * the table.
1350 : : */
2458 tgl@sss.pgh.pa.us 1351 [ + + ]: 33559 : if (IS_PARTITIONED(hashp->hctl))
1352 : : {
1353 [ + + ]: 44992 : for (i = 1; i < NUM_FREELISTS; i++)
1354 : 43586 : sum += hashp->hctl->freeList[i].nentries;
1355 : : }
1356 : :
2944 rhaas@postgresql.org 1357 : 33559 : return sum;
1358 : : }
1359 : :
1360 : : /*
1361 : : * hash_seq_init/_search/_term
1362 : : * Sequentially search through hash table and return
1363 : : * all the elements one by one, return NULL when no more.
1364 : : *
1365 : : * hash_seq_term should be called if and only if the scan is abandoned before
1366 : : * completion; if hash_seq_search returns NULL then it has already done the
1367 : : * end-of-scan cleanup.
1368 : : *
1369 : : * NOTE: caller may delete the returned element before continuing the scan.
1370 : : * However, deleting any other element while the scan is in progress is
1371 : : * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1372 : : * if elements are added to the table while the scan is in progress, it is
1373 : : * unspecified whether they will be visited by the scan or not.
1374 : : *
1375 : : * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1376 : : * worry about hash_seq_term cleanup, if the hashtable is first locked against
1377 : : * further insertions by calling hash_freeze.
1378 : : *
1379 : : * NOTE: to use this with a partitioned hashtable, caller had better hold
1380 : : * at least shared lock on all partitions of the table throughout the scan!
1381 : : * We can cope with insertions or deletions by our own backend, but *not*
1382 : : * with concurrent insertions or deletions by another.
1383 : : */
1384 : : void
8503 tgl@sss.pgh.pa.us 1385 : 3170117 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1386 : : {
1387 : 3170117 : status->hashp = hashp;
1388 : 3170117 : status->curBucket = 0;
8231 1389 : 3170117 : status->curEntry = NULL;
6198 1390 [ + - ]: 3170117 : if (!hashp->frozen)
1391 : 3170117 : register_seq_scan(hashp);
8503 1392 : 3170117 : }
1393 : :
1394 : : void *
1395 : 37603005 : hash_seq_search(HASH_SEQ_STATUS *status)
1396 : : {
1397 : : HTAB *hashp;
1398 : : HASHHDR *hctl;
1399 : : uint32 max_bucket;
1400 : : long ssize;
1401 : : long segment_num;
1402 : : long segment_ndx;
1403 : : HASHSEGMENT segp;
1404 : : uint32 curBucket;
1405 : : HASHELEMENT *curElem;
1406 : :
6918 1407 [ + + ]: 37603005 : if ((curElem = status->curEntry) != NULL)
1408 : : {
1409 : : /* Continuing scan of curBucket... */
1410 : 8471108 : status->curEntry = curElem->link;
6756 bruce@momjian.us 1411 [ + + ]: 8471108 : if (status->curEntry == NULL) /* end of this bucket */
6918 tgl@sss.pgh.pa.us 1412 : 6235225 : ++status->curBucket;
1413 : 8471108 : return (void *) ELEMENTKEY(curElem);
1414 : : }
1415 : :
1416 : : /*
1417 : : * Search for next nonempty bucket starting at curBucket.
1418 : : */
1419 : 29131897 : curBucket = status->curBucket;
1420 : 29131897 : hashp = status->hashp;
1421 : 29131897 : hctl = hashp->hctl;
6476 1422 : 29131897 : ssize = hashp->ssize;
6918 1423 : 29131897 : max_bucket = hctl->max_bucket;
1424 : :
1425 [ + + ]: 29131897 : if (curBucket > max_bucket)
1426 : : {
6198 1427 : 158410 : hash_seq_term(status);
6756 bruce@momjian.us 1428 : 158410 : return NULL; /* search is done */
1429 : : }
1430 : :
1431 : : /*
1432 : : * first find the right segment in the table directory.
1433 : : */
6476 tgl@sss.pgh.pa.us 1434 : 28973487 : segment_num = curBucket >> hashp->sshift;
6918 1435 : 28973487 : segment_ndx = MOD(curBucket, ssize);
1436 : :
1437 : 28973487 : segp = hashp->dir[segment_num];
1438 : :
1439 : : /*
1440 : : * Pick up the first item in this bucket's chain. If chain is not empty
1441 : : * we can begin searching it. Otherwise we have to advance to find the
1442 : : * next nonempty bucket. We try to optimize that case since searching a
1443 : : * near-empty hashtable has to iterate this loop a lot.
1444 : : */
1445 [ + + ]: 274329003 : while ((curElem = segp[segment_ndx]) == NULL)
1446 : : {
1447 : : /* empty bucket, advance to next */
1448 [ + + ]: 248351109 : if (++curBucket > max_bucket)
1449 : : {
1450 : 2995593 : status->curBucket = curBucket;
6198 1451 : 2995593 : hash_seq_term(status);
6756 bruce@momjian.us 1452 : 2995593 : return NULL; /* search is done */
1453 : : }
6918 tgl@sss.pgh.pa.us 1454 [ + + ]: 245355516 : if (++segment_ndx >= ssize)
1455 : : {
1456 : 101674 : segment_num++;
1457 : 101674 : segment_ndx = 0;
1458 : 101674 : segp = hashp->dir[segment_num];
1459 : : }
1460 : : }
1461 : :
1462 : : /* Begin scan of curBucket... */
1463 : 25977894 : status->curEntry = curElem->link;
2489 1464 [ + + ]: 25977894 : if (status->curEntry == NULL) /* end of this bucket */
6918 1465 : 19742617 : ++curBucket;
1466 : 25977894 : status->curBucket = curBucket;
1467 : 25977894 : return (void *) ELEMENTKEY(curElem);
1468 : : }
1469 : :
1470 : : void
6198 1471 : 3170107 : hash_seq_term(HASH_SEQ_STATUS *status)
1472 : : {
1473 [ + - ]: 3170107 : if (!status->hashp->frozen)
1474 : 3170107 : deregister_seq_scan(status->hashp);
1475 : 3170107 : }
1476 : :
1477 : : /*
1478 : : * hash_freeze
1479 : : * Freeze a hashtable against future insertions (deletions are
1480 : : * still allowed)
1481 : : *
1482 : : * The reason for doing this is that by preventing any more bucket splits,
1483 : : * we no longer need to worry about registering hash_seq_search scans,
1484 : : * and thus caller need not be careful about ensuring hash_seq_term gets
1485 : : * called at the right times.
1486 : : *
1487 : : * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1488 : : * with active scans (since hash_seq_term would then do the wrong thing).
1489 : : */
1490 : : void
6198 tgl@sss.pgh.pa.us 1491 :UBC 0 : hash_freeze(HTAB *hashp)
1492 : : {
1493 [ # # ]: 0 : if (hashp->isshared)
6060 1494 [ # # ]: 0 : elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
6198 1495 [ # # # # ]: 0 : if (!hashp->frozen && has_seq_scans(hashp))
6060 1496 [ # # ]: 0 : elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1497 : : hashp->tabname);
6198 1498 : 0 : hashp->frozen = true;
1499 : 0 : }
1500 : :
1501 : :
1502 : : /********************************* UTILITIES ************************/
1503 : :
1504 : : /*
1505 : : * Expand the table by adding one more hash bucket.
1506 : : */
1507 : : static bool
9715 bruce@momjian.us 1508 :CBC 351581 : expand_table(HTAB *hashp)
1509 : : {
8207 1510 : 351581 : HASHHDR *hctl = hashp->hctl;
1511 : : HASHSEGMENT old_seg,
1512 : : new_seg;
1513 : : long old_bucket,
1514 : : new_bucket;
1515 : : long new_segnum,
1516 : : new_segndx;
1517 : : long old_segnum,
1518 : : old_segndx;
1519 : : HASHBUCKET *oldlink,
1520 : : *newlink;
1521 : : HASHBUCKET currElement,
1522 : : nextElement;
1523 : :
6476 tgl@sss.pgh.pa.us 1524 [ - + ]: 351581 : Assert(!IS_PARTITIONED(hctl));
1525 : :
1526 : : #ifdef HASH_STATISTICS
1527 : : hash_expansions++;
1528 : : #endif
1529 : :
9183 1530 : 351581 : new_bucket = hctl->max_bucket + 1;
6476 1531 : 351581 : new_segnum = new_bucket >> hashp->sshift;
1532 : 351581 : new_segndx = MOD(new_bucket, hashp->ssize);
1533 : :
9716 bruce@momjian.us 1534 [ + + ]: 351581 : if (new_segnum >= hctl->nsegs)
1535 : : {
1536 : : /* Allocate new segment if necessary -- could fail if dir full */
1537 [ - + ]: 1213 : if (new_segnum >= hctl->dsize)
9091 bruce@momjian.us 1538 [ # # ]:UBC 0 : if (!dir_realloc(hashp))
8231 tgl@sss.pgh.pa.us 1539 : 0 : return false;
9716 bruce@momjian.us 1540 [ - + ]:CBC 1213 : if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
8231 tgl@sss.pgh.pa.us 1541 :UBC 0 : return false;
9716 bruce@momjian.us 1542 :CBC 1213 : hctl->nsegs++;
1543 : : }
1544 : :
1545 : : /* OK, we created a new bucket */
9183 tgl@sss.pgh.pa.us 1546 : 351581 : hctl->max_bucket++;
1547 : :
1548 : : /*
1549 : : * *Before* changing masks, find old bucket corresponding to same hash
1550 : : * values; values in that bucket may need to be relocated to new bucket.
1551 : : * Note that new_bucket is certainly larger than low_mask at this point,
1552 : : * so we can skip the first step of the regular hash mask calc.
1553 : : */
9085 1554 : 351581 : old_bucket = (new_bucket & hctl->low_mask);
1555 : :
1556 : : /*
1557 : : * If we crossed a power of 2, readjust masks.
1558 : : */
8072 1559 [ + + ]: 351581 : if ((uint32) new_bucket > hctl->high_mask)
1560 : : {
9716 bruce@momjian.us 1561 : 1964 : hctl->low_mask = hctl->high_mask;
8072 tgl@sss.pgh.pa.us 1562 : 1964 : hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1563 : : }
1564 : :
1565 : : /*
1566 : : * Relocate records to the new bucket. NOTE: because of the way the hash
1567 : : * masking is done in calc_bucket, only one old bucket can need to be
1568 : : * split at this point. With a different way of reducing the hash value,
1569 : : * that might not be true!
1570 : : */
6476 1571 : 351581 : old_segnum = old_bucket >> hashp->sshift;
1572 : 351581 : old_segndx = MOD(old_bucket, hashp->ssize);
1573 : :
8231 1574 : 351581 : old_seg = hashp->dir[old_segnum];
1575 : 351581 : new_seg = hashp->dir[new_segnum];
1576 : :
1577 : 351581 : oldlink = &old_seg[old_segndx];
1578 : 351581 : newlink = &new_seg[new_segndx];
1579 : :
1580 : 351581 : for (currElement = *oldlink;
1581 [ + + ]: 850496 : currElement != NULL;
1582 : 498915 : currElement = nextElement)
1583 : : {
1584 : 498915 : nextElement = currElement->link;
7544 1585 [ + + ]: 498915 : if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1586 : : {
8231 1587 : 249128 : *oldlink = currElement;
1588 : 249128 : oldlink = &currElement->link;
1589 : : }
1590 : : else
1591 : : {
1592 : 249787 : *newlink = currElement;
1593 : 249787 : newlink = &currElement->link;
1594 : : }
1595 : : }
1596 : : /* don't forget to terminate the rebuilt hash chains... */
1597 : 351581 : *oldlink = NULL;
1598 : 351581 : *newlink = NULL;
1599 : :
1600 : 351581 : return true;
1601 : : }
1602 : :
1603 : :
1604 : : static bool
9715 bruce@momjian.us 1605 :UBC 0 : dir_realloc(HTAB *hashp)
1606 : : {
1607 : : HASHSEGMENT *p;
1608 : : HASHSEGMENT *old_p;
1609 : : long new_dsize;
1610 : : long old_dirsize;
1611 : : long new_dirsize;
1612 : :
9716 1613 [ # # ]: 0 : if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
8231 tgl@sss.pgh.pa.us 1614 : 0 : return false;
1615 : :
1616 : : /* Reallocate directory */
9183 1617 : 0 : new_dsize = hashp->hctl->dsize << 1;
8231 1618 : 0 : old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1619 : 0 : new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1620 : :
1621 : 0 : old_p = hashp->dir;
8332 JanWieck@Yahoo.com 1622 : 0 : CurrentDynaHashCxt = hashp->hcxt;
8231 tgl@sss.pgh.pa.us 1623 : 0 : p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1624 : :
9716 bruce@momjian.us 1625 [ # # ]: 0 : if (p != NULL)
1626 : : {
8231 tgl@sss.pgh.pa.us 1627 : 0 : memcpy(p, old_p, old_dirsize);
1628 [ # # # # : 0 : MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
# # # # #
# ]
1629 : 0 : hashp->dir = p;
9183 1630 : 0 : hashp->hctl->dsize = new_dsize;
1631 : :
1632 : : /* XXX assume the allocator is palloc, so we know how to free */
6918 1633 [ # # ]: 0 : Assert(hashp->alloc == DynaHashAlloc);
1634 : 0 : pfree(old_p);
1635 : :
8231 1636 : 0 : return true;
1637 : : }
1638 : :
1639 : 0 : return false;
1640 : : }
1641 : :
1642 : :
1643 : : static HASHSEGMENT
9715 bruce@momjian.us 1644 :CBC 411400 : seg_alloc(HTAB *hashp)
1645 : : {
1646 : : HASHSEGMENT segp;
1647 : :
8332 JanWieck@Yahoo.com 1648 : 411400 : CurrentDynaHashCxt = hashp->hcxt;
6476 tgl@sss.pgh.pa.us 1649 : 411400 : segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1650 : :
9716 bruce@momjian.us 1651 [ - + ]: 411400 : if (!segp)
8231 tgl@sss.pgh.pa.us 1652 :UBC 0 : return NULL;
1653 : :
6476 tgl@sss.pgh.pa.us 1654 [ + - + - :CBC 411400 : MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
+ - - + -
- ]
1655 : :
8231 1656 : 411400 : return segp;
1657 : : }
1658 : :
1659 : : /*
1660 : : * allocate some new elements and link them into the indicated free list
1661 : : */
1662 : : static bool
2944 rhaas@postgresql.org 1663 : 422772 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
1664 : : {
1665 : 422772 : HASHHDR *hctl = hashp->hctl;
1666 : : Size elementSize;
1667 : : HASHELEMENT *firstElement;
1668 : : HASHELEMENT *tmpElement;
1669 : : HASHELEMENT *prevElement;
1670 : : int i;
1671 : :
4752 heikki.linnakangas@i 1672 [ - + ]: 422772 : if (hashp->isfixed)
4752 heikki.linnakangas@i 1673 :UBC 0 : return false;
1674 : :
1675 : : /* Each element has a HASHELEMENT header plus user data. */
3103 rhaas@postgresql.org 1676 :CBC 422772 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
1677 : :
8332 JanWieck@Yahoo.com 1678 : 422772 : CurrentDynaHashCxt = hashp->hcxt;
6476 tgl@sss.pgh.pa.us 1679 : 422772 : firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
1680 : :
1681 [ - + ]: 422772 : if (!firstElement)
8231 tgl@sss.pgh.pa.us 1682 :UBC 0 : return false;
1683 : :
1684 : : /* prepare to link all the new entries into the freelist */
6476 tgl@sss.pgh.pa.us 1685 :CBC 422772 : prevElement = NULL;
1686 : 422772 : tmpElement = firstElement;
7138 1687 [ + + ]: 39768317 : for (i = 0; i < nelem; i++)
1688 : : {
6476 1689 : 39345545 : tmpElement->link = prevElement;
1690 : 39345545 : prevElement = tmpElement;
8231 1691 : 39345545 : tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1692 : : }
1693 : :
1694 : : /* if partitioned, must lock to touch freeList */
3103 rhaas@postgresql.org 1695 [ + + ]: 422772 : if (IS_PARTITIONED(hctl))
2944 1696 [ - + ]: 145082 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1697 : :
1698 : : /* freelist could be nonempty if two backends did this concurrently */
1699 : 422772 : firstElement->link = hctl->freeList[freelist_idx].freeList;
1700 : 422772 : hctl->freeList[freelist_idx].freeList = prevElement;
1701 : :
3103 1702 [ + + ]: 422772 : if (IS_PARTITIONED(hctl))
2944 1703 : 145082 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1704 : :
8231 tgl@sss.pgh.pa.us 1705 : 422772 : return true;
1706 : : }
1707 : :
1708 : : /*
1709 : : * Do initial lookup of a bucket for the given hash value, retrieving its
1710 : : * bucket number and its hash bucket.
1711 : : */
1712 : : static inline uint32
30 michael@paquier.xyz 1713 :GNC 159971918 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
1714 : : {
1715 : 159971918 : HASHHDR *hctl = hashp->hctl;
1716 : : HASHSEGMENT segp;
1717 : : long segment_num;
1718 : : long segment_ndx;
1719 : : uint32 bucket;
1720 : :
1721 : 159971918 : bucket = calc_bucket(hctl, hashvalue);
1722 : :
1723 : 159971918 : segment_num = bucket >> hashp->sshift;
1724 : 159971918 : segment_ndx = MOD(bucket, hashp->ssize);
1725 : :
1726 : 159971918 : segp = hashp->dir[segment_num];
1727 : :
1728 [ - + ]: 159971918 : if (segp == NULL)
30 michael@paquier.xyz 1729 :UNC 0 : hash_corrupted(hashp);
1730 : :
30 michael@paquier.xyz 1731 :GNC 159971918 : *bucketptr = &segp[segment_ndx];
1732 : 159971918 : return bucket;
1733 : : }
1734 : :
1735 : : /* complain when we have detected a corrupted hashtable */
1736 : : static void
8227 tgl@sss.pgh.pa.us 1737 :UBC 0 : hash_corrupted(HTAB *hashp)
1738 : : {
1739 : : /*
1740 : : * If the corruption is in a shared hashtable, we'd better force a
1741 : : * systemwide restart. Otherwise, just shut down this one backend.
1742 : : */
1743 [ # # ]: 0 : if (hashp->isshared)
7569 1744 [ # # ]: 0 : elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1745 : : else
1746 [ # # ]: 0 : elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1747 : : }
1748 : :
/*
 * Compute ceil(log2(num)).
 *
 * Inputs above LONG_MAX / 2 are treated as LONG_MAX / 2, because larger
 * values would be invalid for pg_ceil_log2_*().
 */
int
my_log2(long num)
{
	long		capped = (num > LONG_MAX / 2) ? LONG_MAX / 2 : num;

#if SIZEOF_LONG < 8
	return pg_ceil_log2_32(capped);
#else
	return pg_ceil_log2_64(capped);
#endif
}
1766 : :
/*
 * Return the smallest power of 2 that is >= num, limited to what fits in a
 * long.  No separate range check is needed: my_log2 already clamps its input.
 */
static long
next_pow2_long(long num)
{
	int			shift = my_log2(num);

	return 1L << shift;
}
1774 : :
/*
 * Return the smallest power of 2 that is >= num, limited to what fits in an
 * int.  Inputs above INT_MAX / 2 are treated as INT_MAX / 2.
 */
static int
next_pow2_int(long num)
{
	long		capped = (num > INT_MAX / 2) ? INT_MAX / 2 : num;

	return 1 << my_log2(capped);
}
1783 : :
1784 : :
1785 : : /************************* SEQ SCAN TRACKING ************************/
1786 : :
1787 : : /*
1788 : : * We track active hash_seq_search scans here. The need for this mechanism
1789 : : * comes from the fact that a scan will get confused if a bucket split occurs
1790 : : * while it's in progress: it might visit entries twice, or even miss some
1791 : : * entirely (if it's partway through the same bucket that splits). Hence
1792 : : * we want to inhibit bucket splits if there are any active scans on the
1793 : : * table being inserted into. This is a fairly rare case in current usage,
1794 : : * so just postponing the split until the next insertion seems sufficient.
1795 : : *
1796 : : * Given present usages of the function, only a few scans are likely to be
1797 : : * open concurrently; so a finite-size stack of open scans seems sufficient,
1798 : : * and we don't worry that linear search is too slow. Note that we do
1799 : : * allow multiple scans of the same hashtable to be open concurrently.
1800 : : *
1801 : : * This mechanism can support concurrent scan and insertion in a shared
1802 : : * hashtable if it's the same backend doing both. It would fail otherwise,
1803 : : * but locking reasons seem to preclude any such scenario anyway, so we don't
1804 : : * worry.
1805 : : *
1806 : : * This arrangement is reasonably robust if a transient hashtable is deleted
1807 : : * without notifying us. The absolute worst case is we might inhibit splits
1808 : : * in another table created later at exactly the same address. We will give
1809 : : * a warning at transaction end for reference leaks, so any bugs leading to
1810 : : * lack of notification should be easy to catch.
1811 : : */
1812 : :
1813 : : #define MAX_SEQ_SCANS 100
1814 : :
1815 : : static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1816 : : static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
1817 : : static int num_seq_scans = 0;
1818 : :
1819 : :
1820 : : /* Register a table as having an active hash_seq_search scan */
1821 : : static void
6198 1822 : 3170117 : register_seq_scan(HTAB *hashp)
1823 : : {
1824 [ - + ]: 3170117 : if (num_seq_scans >= MAX_SEQ_SCANS)
6060 tgl@sss.pgh.pa.us 1825 [ # # ]:UBC 0 : elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1826 : : hashp->tabname);
6198 tgl@sss.pgh.pa.us 1827 :CBC 3170117 : seq_scan_tables[num_seq_scans] = hashp;
1828 : 3170117 : seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1829 : 3170117 : num_seq_scans++;
1830 : 3170117 : }
1831 : :
1832 : : /* Deregister an active scan */
1833 : : static void
1834 : 3170107 : deregister_seq_scan(HTAB *hashp)
1835 : : {
1836 : : int i;
1837 : :
1838 : : /* Search backward since it's most likely at the stack top */
1839 [ + - ]: 3170107 : for (i = num_seq_scans - 1; i >= 0; i--)
1840 : : {
1841 [ + - ]: 3170107 : if (seq_scan_tables[i] == hashp)
1842 : : {
1843 : 3170107 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1844 : 3170107 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1845 : 3170107 : num_seq_scans--;
1846 : 3170107 : return;
1847 : : }
1848 : : }
6198 tgl@sss.pgh.pa.us 1849 [ # # ]:UBC 0 : elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1850 : : hashp->tabname);
1851 : : }
1852 : :
1853 : : /* Check if a table has any active scan */
1854 : : static bool
6198 tgl@sss.pgh.pa.us 1855 :CBC 351581 : has_seq_scans(HTAB *hashp)
1856 : : {
1857 : : int i;
1858 : :
1859 [ - + ]: 351581 : for (i = 0; i < num_seq_scans; i++)
1860 : : {
6198 tgl@sss.pgh.pa.us 1861 [ # # ]:UBC 0 : if (seq_scan_tables[i] == hashp)
1862 : 0 : return true;
1863 : : }
6198 tgl@sss.pgh.pa.us 1864 :CBC 351581 : return false;
1865 : : }
1866 : :
1867 : : /* Clean up any open scans at end of transaction */
1868 : : void
1869 : 433156 : AtEOXact_HashTables(bool isCommit)
1870 : : {
1871 : : /*
1872 : : * During abort cleanup, open scans are expected; just silently clean 'em
1873 : : * out. An open scan at commit means someone forgot a hash_seq_term()
1874 : : * call, so complain.
1875 : : *
1876 : : * Note: it's tempting to try to print the tabname here, but refrain for
1877 : : * fear of touching deallocated memory. This isn't a user-facing message
1878 : : * anyway, so it needn't be pretty.
1879 : : */
1880 [ + + ]: 433156 : if (isCommit)
1881 : : {
1882 : : int i;
1883 : :
1884 [ - + ]: 410325 : for (i = 0; i < num_seq_scans; i++)
1885 : : {
6198 tgl@sss.pgh.pa.us 1886 [ # # ]:UBC 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1887 : : seq_scan_tables[i]);
1888 : : }
1889 : : }
6198 tgl@sss.pgh.pa.us 1890 :CBC 433156 : num_seq_scans = 0;
1891 : 433156 : }
1892 : :
1893 : : /* Clean up any open scans at end of subtransaction */
1894 : : void
1895 : 9935 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1896 : : {
1897 : : int i;
1898 : :
1899 : : /*
1900 : : * Search backward to make cleanup easy. Note we must check all entries,
1901 : : * not only those at the end of the array, because deletion technique
1902 : : * doesn't keep them in order.
1903 : : */
1904 [ - + ]: 9935 : for (i = num_seq_scans - 1; i >= 0; i--)
1905 : : {
6198 tgl@sss.pgh.pa.us 1906 [ # # ]:UBC 0 : if (seq_scan_level[i] >= nestDepth)
1907 : : {
1908 [ # # ]: 0 : if (isCommit)
1909 [ # # ]: 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1910 : : seq_scan_tables[i]);
1911 : 0 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1912 : 0 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1913 : 0 : num_seq_scans--;
1914 : : }
1915 : : }
6198 tgl@sss.pgh.pa.us 1916 :CBC 9935 : }
|