TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashfunc.c
4 : * Support functions for hash access method.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashfunc.c
12 : *
13 : * NOTES
14 : * These functions are stored in pg_amproc. For each operator class
15 : * defined for hash indexes, they compute the hash value of the argument.
16 : *
17 : * Additional hash functions appear in /utils/adt/ files for various
18 : * specialized datatypes.
19 : *
20 : * It is expected that every bit of a hash function's 32-bit result is
21 : * as random as every other; failure to ensure this is likely to lead
22 : * to poor performance of hash joins, for example. In most cases a hash
23 : * function should use hash_any() or its variant hash_uint32().
24 : *-------------------------------------------------------------------------
25 : */
26 :
27 : #include "postgres.h"
28 :
29 : #include "access/hash.h"
30 : #include "catalog/pg_collation.h"
31 : #include "common/hashfn.h"
32 : #include "utils/builtins.h"
33 : #include "utils/float.h"
34 : #include "utils/pg_locale.h"
35 : #include "varatt.h"
36 :
37 : /*
38 : * Datatype-specific hash functions.
39 : *
40 : * These support both hash indexes and hash joins.
41 : *
42 : * NOTE: some of these are also used by catcache operations, without
43 : * any direct connection to hash indexes. Also, the common hash_any
44 : * routine is also used by dynahash tables.
45 : */
46 :
47 : /* Note: this is used for both "char" and boolean datatypes */
48 : Datum
49 GIC 235744 : hashchar(PG_FUNCTION_ARGS)
50 ECB : {
51 GIC 235744 : return hash_uint32((int32) PG_GETARG_CHAR(0));
52 ECB : }
53 :
54 : Datum
55 GIC 33 : hashcharextended(PG_FUNCTION_ARGS)
56 ECB : {
57 GIC 33 : return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1));
58 ECB : }
59 :
60 : Datum
61 GIC 229705 : hashint2(PG_FUNCTION_ARGS)
62 ECB : {
63 GIC 229705 : return hash_uint32((int32) PG_GETARG_INT16(0));
64 ECB : }
65 :
66 : Datum
67 GIC 24 : hashint2extended(PG_FUNCTION_ARGS)
68 ECB : {
69 GIC 24 : return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1));
70 ECB : }
71 :
72 : Datum
73 GIC 12937844 : hashint4(PG_FUNCTION_ARGS)
74 ECB : {
75 GIC 12937844 : return hash_uint32(PG_GETARG_INT32(0));
76 ECB : }
77 :
78 : Datum
79 GIC 103557 : hashint4extended(PG_FUNCTION_ARGS)
80 ECB : {
81 GIC 103557 : return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1));
82 ECB : }
83 :
84 : Datum
85 GIC 307240 : hashint8(PG_FUNCTION_ARGS)
86 ECB : {
87 : /*
88 : * The idea here is to produce a hash value compatible with the values
89 : * produced by hashint4 and hashint2 for logically equal inputs; this is
90 : * necessary to support cross-type hash joins across these input types.
91 : * Since all three types are signed, we can xor the high half of the int8
92 : * value if the sign is positive, or the complement of the high half when
93 : * the sign is negative.
94 : */
95 GIC 307240 : int64 val = PG_GETARG_INT64(0);
96 CBC 307240 : uint32 lohalf = (uint32) val;
97 307240 : uint32 hihalf = (uint32) (val >> 32);
98 ECB :
99 GIC 307240 : lohalf ^= (val >= 0) ? hihalf : ~hihalf;
100 ECB :
101 GIC 307240 : return hash_uint32(lohalf);
102 ECB : }
103 :
104 : Datum
105 GIC 186 : hashint8extended(PG_FUNCTION_ARGS)
106 ECB : {
107 : /* Same approach as hashint8 */
108 GIC 186 : int64 val = PG_GETARG_INT64(0);
109 CBC 186 : uint32 lohalf = (uint32) val;
110 186 : uint32 hihalf = (uint32) (val >> 32);
111 ECB :
112 GIC 186 : lohalf ^= (val >= 0) ? hihalf : ~hihalf;
113 ECB :
114 GIC 186 : return hash_uint32_extended(lohalf, PG_GETARG_INT64(1));
115 ECB : }
116 :
117 : Datum
118 GIC 7662999 : hashoid(PG_FUNCTION_ARGS)
119 ECB : {
120 GIC 7662999 : return hash_uint32((uint32) PG_GETARG_OID(0));
121 ECB : }
122 :
123 : Datum
124 GIC 36 : hashoidextended(PG_FUNCTION_ARGS)
125 ECB : {
126 GIC 36 : return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1));
127 ECB : }
128 :
129 : Datum
130 GIC 1571 : hashenum(PG_FUNCTION_ARGS)
131 ECB : {
132 GIC 1571 : return hash_uint32((uint32) PG_GETARG_OID(0));
133 ECB : }
134 :
135 : Datum
136 GIC 3018 : hashenumextended(PG_FUNCTION_ARGS)
137 ECB : {
138 GIC 3018 : return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1));
139 ECB : }
140 :
141 : Datum
142 GIC 21155 : hashfloat4(PG_FUNCTION_ARGS)
143 ECB : {
144 GIC 21155 : float4 key = PG_GETARG_FLOAT4(0);
145 ECB : float8 key8;
146 :
147 : /*
148 : * On IEEE-float machines, minus zero and zero have different bit patterns
149 : * but should compare as equal. We must ensure that they have the same
150 : * hash value, which is most reliably done this way:
151 : */
152 GIC 21155 : if (key == (float4) 0)
153 CBC 12 : PG_RETURN_UINT32(0);
154 ECB :
155 : /*
156 : * To support cross-type hashing of float8 and float4, we want to return
157 : * the same hash value hashfloat8 would produce for an equal float8 value.
158 : * So, widen the value to float8 and hash that. (We must do this rather
159 : * than have hashfloat8 try to narrow its value to float4; that could fail
160 : * on overflow.)
161 : */
162 GIC 21143 : key8 = key;
163 ECB :
164 : /*
165 : * Similarly, NaNs can have different bit patterns but they should all
166 : * compare as equal. For backwards-compatibility reasons we force them to
167 : * have the hash value of a standard float8 NaN. (You'd think we could
168 : * replace key with a float4 NaN and then widen it; but on some old
169 : * platforms, that way produces a different bit pattern.)
170 : */
171 GIC 21143 : if (isnan(key8))
172 CBC 9 : key8 = get_float8_nan();
173 ECB :
174 GIC 21143 : return hash_any((unsigned char *) &key8, sizeof(key8));
175 ECB : }
176 :
177 : Datum
178 GIC 36 : hashfloat4extended(PG_FUNCTION_ARGS)
179 ECB : {
180 GIC 36 : float4 key = PG_GETARG_FLOAT4(0);
181 CBC 36 : uint64 seed = PG_GETARG_INT64(1);
182 ECB : float8 key8;
183 :
184 : /* Same approach as hashfloat4 */
185 GIC 36 : if (key == (float4) 0)
186 CBC 6 : PG_RETURN_UINT64(seed);
187 30 : key8 = key;
188 30 : if (isnan(key8))
189 LBC 0 : key8 = get_float8_nan();
190 EUB :
191 GIC 30 : return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed);
192 ECB : }
193 :
194 : Datum
195 GIC 61783 : hashfloat8(PG_FUNCTION_ARGS)
196 ECB : {
197 GIC 61783 : float8 key = PG_GETARG_FLOAT8(0);
198 ECB :
199 : /*
200 : * On IEEE-float machines, minus zero and zero have different bit patterns
201 : * but should compare as equal. We must ensure that they have the same
202 : * hash value, which is most reliably done this way:
203 : */
204 GIC 61783 : if (key == (float8) 0)
205 CBC 355 : PG_RETURN_UINT32(0);
206 ECB :
207 : /*
208 : * Similarly, NaNs can have different bit patterns but they should all
209 : * compare as equal. For backwards-compatibility reasons we force them to
210 : * have the hash value of a standard NaN.
211 : */
212 GIC 61428 : if (isnan(key))
213 CBC 9 : key = get_float8_nan();
214 ECB :
215 GIC 61428 : return hash_any((unsigned char *) &key, sizeof(key));
216 ECB : }
217 :
218 : Datum
219 GIC 36 : hashfloat8extended(PG_FUNCTION_ARGS)
220 ECB : {
221 GIC 36 : float8 key = PG_GETARG_FLOAT8(0);
222 CBC 36 : uint64 seed = PG_GETARG_INT64(1);
223 ECB :
224 : /* Same approach as hashfloat8 */
225 GIC 36 : if (key == (float8) 0)
226 CBC 6 : PG_RETURN_UINT64(seed);
227 30 : if (isnan(key))
228 LBC 0 : key = get_float8_nan();
229 EUB :
230 GIC 30 : return hash_any_extended((unsigned char *) &key, sizeof(key), seed);
231 ECB : }
232 :
233 : Datum
234 GIC 460534 : hashoidvector(PG_FUNCTION_ARGS)
235 ECB : {
236 GIC 460534 : oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
237 ECB :
238 GIC 460534 : return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
239 ECB : }
240 :
241 : Datum
242 GIC 30 : hashoidvectorextended(PG_FUNCTION_ARGS)
243 ECB : {
244 GIC 30 : oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
245 ECB :
246 GIC 60 : return hash_any_extended((unsigned char *) key->values,
247 CBC 30 : key->dim1 * sizeof(Oid),
248 30 : PG_GETARG_INT64(1));
249 ECB : }
250 :
251 : Datum
252 GIC 249601 : hashname(PG_FUNCTION_ARGS)
253 ECB : {
254 GIC 249601 : char *key = NameStr(*PG_GETARG_NAME(0));
255 ECB :
256 GIC 249601 : return hash_any((unsigned char *) key, strlen(key));
257 ECB : }
258 :
259 : Datum
260 GIC 30 : hashnameextended(PG_FUNCTION_ARGS)
261 ECB : {
262 GIC 30 : char *key = NameStr(*PG_GETARG_NAME(0));
263 ECB :
264 GIC 30 : return hash_any_extended((unsigned char *) key, strlen(key),
265 CBC 30 : PG_GETARG_INT64(1));
266 ECB : }
267 :
268 : Datum
269 GIC 1007924 : hashtext(PG_FUNCTION_ARGS)
270 ECB : {
271 GIC 1007924 : text *key = PG_GETARG_TEXT_PP(0);
272 CBC 1007924 : Oid collid = PG_GET_COLLATION();
273 1007924 : pg_locale_t mylocale = 0;
274 ECB : Datum result;
275 :
276 GIC 1007924 : if (!collid)
277 CBC 3 : ereport(ERROR,
278 ECB : (errcode(ERRCODE_INDETERMINATE_COLLATION),
279 : errmsg("could not determine which collation to use for string hashing"),
280 : errhint("Use the COLLATE clause to set the collation explicitly.")));
281 :
282 GIC 1007921 : if (!lc_collate_is_c(collid))
283 CBC 730129 : mylocale = pg_newlocale_from_collation(collid);
284 ECB :
285 GNC 1007921 : if (pg_locale_deterministic(mylocale))
286 ECB : {
287 GIC 1007804 : result = hash_any((unsigned char *) VARDATA_ANY(key),
288 CBC 1007804 : VARSIZE_ANY_EXHDR(key));
289 ECB : }
290 : else
291 : {
292 : Size bsize, rsize;
293 : char *buf;
294 GNC 117 : const char *keydata = VARDATA_ANY(key);
295 117 : size_t keylen = VARSIZE_ANY_EXHDR(key);
296 ECB :
297 :
298 GNC 117 : bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
299 117 : buf = palloc(bsize + 1);
300 :
301 117 : rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
302 117 : if (rsize != bsize)
303 UNC 0 : elog(ERROR, "pg_strnxfrm() returned unexpected result");
304 :
305 : /*
306 : * In principle, there's no reason to include the terminating NUL
307 : * character in the hash, but it was done before and the behavior
308 : * must be preserved.
309 : */
310 GNC 117 : result = hash_any((uint8_t *) buf, bsize + 1);
311 :
312 117 : pfree(buf);
313 ECB : }
314 :
315 : /* Avoid leaking memory for toasted inputs */
316 GIC 1007921 : PG_FREE_IF_COPY(key, 0);
317 :
318 1007921 : return result;
319 ECB : }
320 :
321 : Datum
322 CBC 2034 : hashtextextended(PG_FUNCTION_ARGS)
323 ECB : {
324 GIC 2034 : text *key = PG_GETARG_TEXT_PP(0);
325 2034 : Oid collid = PG_GET_COLLATION();
326 CBC 2034 : pg_locale_t mylocale = 0;
327 EUB : Datum result;
328 :
329 GIC 2034 : if (!collid)
330 UIC 0 : ereport(ERROR,
331 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
332 ECB : errmsg("could not determine which collation to use for string hashing"),
333 : errhint("Use the COLLATE clause to set the collation explicitly.")));
334 :
335 CBC 2034 : if (!lc_collate_is_c(collid))
336 GIC 2034 : mylocale = pg_newlocale_from_collation(collid);
337 ECB :
338 GNC 2034 : if (pg_locale_deterministic(mylocale))
339 ECB : {
340 GIC 2022 : result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
341 2022 : VARSIZE_ANY_EXHDR(key),
342 2022 : PG_GETARG_INT64(1));
343 : }
344 : else
345 ECB : {
346 : Size bsize, rsize;
347 : char *buf;
348 GNC 12 : const char *keydata = VARDATA_ANY(key);
349 12 : size_t keylen = VARSIZE_ANY_EXHDR(key);
350 EUB :
351 GNC 12 : bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
352 12 : buf = palloc(bsize + 1);
353 :
354 12 : rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
355 12 : if (rsize != bsize)
356 UNC 0 : elog(ERROR, "pg_strnxfrm() returned unexpected result");
357 :
358 : /*
359 : * In principle, there's no reason to include the terminating NUL
360 : * character in the hash, but it was done before and the behavior
361 : * must be preserved.
362 : */
363 GNC 12 : result = hash_any_extended((uint8_t *) buf, bsize + 1,
364 12 : PG_GETARG_INT64(1));
365 :
366 12 : pfree(buf);
367 : }
368 :
369 GIC 2034 : PG_FREE_IF_COPY(key, 0);
370 :
371 2034 : return result;
372 ECB : }
373 :
374 : /*
375 : * hashvarlena() can be used for any varlena datatype in which there are
376 : * no non-significant bits, ie, distinct bitpatterns never compare as equal.
377 : */
378 : Datum
379 GIC 3069 : hashvarlena(PG_FUNCTION_ARGS)
380 : {
381 CBC 3069 : struct varlena *key = PG_GETARG_VARLENA_PP(0);
382 : Datum result;
383 ECB :
384 GIC 3069 : result = hash_any((unsigned char *) VARDATA_ANY(key),
385 3069 : VARSIZE_ANY_EXHDR(key));
386 :
387 EUB : /* Avoid leaking memory for toasted inputs */
388 GIC 3069 : PG_FREE_IF_COPY(key, 0);
389 EUB :
390 GIC 3069 : return result;
391 : }
392 EUB :
393 : Datum
394 UBC 0 : hashvarlenaextended(PG_FUNCTION_ARGS)
395 : {
396 0 : struct varlena *key = PG_GETARG_VARLENA_PP(0);
397 : Datum result;
398 EUB :
399 UIC 0 : result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
400 0 : VARSIZE_ANY_EXHDR(key),
401 0 : PG_GETARG_INT64(1));
402 :
403 0 : PG_FREE_IF_COPY(key, 0);
404 :
405 0 : return result;
406 : }
|