Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * simd.h
4 : * Support for platform-specific vector operations.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * src/include/port/simd.h
10 : *
11 : * NOTES
12 : * - VectorN in this file refers to a register where the element operands
13 : * are N bits wide. The vector width is platform-specific, so users that care
14 : * about that will need to inspect "sizeof(VectorN)".
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #ifndef SIMD_H
19 : #define SIMD_H
20 :
21 : #if (defined(__x86_64__) || defined(_M_AMD64))
22 : /*
23 : * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
24 : * that compilers targeting this architecture understand SSE2 intrinsics.
25 : *
26 : * We use emmintrin.h rather than the comprehensive header immintrin.h in
27 : * order to exclude extensions beyond SSE2. This is because MSVC, at least,
28 : * will allow the use of intrinsics that haven't been enabled at compile
29 : * time.
30 : */
31 : #include <emmintrin.h>
32 : #define USE_SSE2
33 : typedef __m128i Vector8;
34 : typedef __m128i Vector32;
35 :
36 : #elif defined(__aarch64__) && defined(__ARM_NEON)
37 : /*
38 : * We use the Neon instructions if the compiler provides access to them (as
39 : * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
40 : * technically optional for aarch64, it appears that all available 64-bit
41 : * hardware does have it. Neon exists in some 32-bit hardware too, but we
42 : * could not realistically use it there without a run-time check, which seems
43 : * not worth the trouble for now.
44 : */
45 : #include <arm_neon.h>
46 : #define USE_NEON
47 : typedef uint8x16_t Vector8;
48 : typedef uint32x4_t Vector32;
49 :
50 : #else
51 : /*
52 : * If no SIMD instructions are available, we can in some cases emulate vector
53 : * operations using bitwise operations on unsigned integers. Note that many
54 : * of the functions in this file presently do not have non-SIMD
55 : * implementations. In particular, none of the functions involving Vector32
56 : * are implemented without SIMD since it's likely not worthwhile to represent
57 : * two 32-bit integers using a uint64.
58 : */
59 : #define USE_NO_SIMD
60 : typedef uint64 Vector8;
61 : #endif
62 :
63 : /* load/store operations */
64 : static inline void vector8_load(Vector8 *v, const uint8 *s);
65 : #ifndef USE_NO_SIMD
66 : static inline void vector32_load(Vector32 *v, const uint32 *s);
67 : #endif
68 :
69 : /* assignment operations */
70 : static inline Vector8 vector8_broadcast(const uint8 c);
71 : #ifndef USE_NO_SIMD
72 : static inline Vector32 vector32_broadcast(const uint32 c);
73 : #endif
74 :
75 : /* element-wise comparisons to a scalar */
76 : static inline bool vector8_has(const Vector8 v, const uint8 c);
77 : static inline bool vector8_has_zero(const Vector8 v);
78 : static inline bool vector8_has_le(const Vector8 v, const uint8 c);
79 : static inline bool vector8_is_highbit_set(const Vector8 v);
80 : #ifndef USE_NO_SIMD
81 : static inline bool vector32_is_highbit_set(const Vector32 v);
82 : #endif
83 :
84 : /* arithmetic operations */
85 : static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
86 : #ifndef USE_NO_SIMD
87 : static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
88 : static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
89 : #endif
90 :
91 : /*
92 : * comparisons between vectors
93 : *
94 : * Note: These return a vector rather than boolean, which is why we don't
95 : * have non-SIMD implementations.
96 : */
97 : #ifndef USE_NO_SIMD
98 : static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
99 : static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
100 : #endif
101 :
102 : /*
103 : * Load a chunk of memory into the given vector.
104 : */
105 : static inline void
232 john.naylor 106 GNC 10101586 : vector8_load(Vector8 *v, const uint8 *s)
107 : {
108 : #if defined(USE_SSE2)
109 10101586 : *v = _mm_loadu_si128((const __m128i *) s);
110 : #elif defined(USE_NEON)
111 : *v = vld1q_u8(s);
112 : #else
113 : memcpy(v, s, sizeof(Vector8));
114 : #endif
115 10101586 : }
116 :
117 : #ifndef USE_NO_SIMD
118 : static inline void
223 119 120 : vector32_load(Vector32 *v, const uint32 *s)
120 : {
121 : #ifdef USE_SSE2
122 120 : *v = _mm_loadu_si128((const __m128i *) s);
123 : #elif defined(USE_NEON)
124 : *v = vld1q_u32(s);
125 : #endif
126 120 : }
127 : #endif /* ! USE_NO_SIMD */
128 :
129 : /*
130 : * Create a vector with all elements set to the same value.
131 : */
132 : static inline Vector8
232 133 14421912 : vector8_broadcast(const uint8 c)
134 : {
135 : #if defined(USE_SSE2)
136 28843824 : return _mm_set1_epi8(c);
137 : #elif defined(USE_NEON)
138 : return vdupq_n_u8(c);
139 : #else
140 : return ~UINT64CONST(0) / 0xFF * c;
141 : #endif
142 : }
143 :
144 : #ifndef USE_NO_SIMD
145 : static inline Vector32
223 146 6372203 : vector32_broadcast(const uint32 c)
147 : {
148 : #ifdef USE_SSE2
149 12744406 : return _mm_set1_epi32(c);
150 : #elif defined(USE_NEON)
151 : return vdupq_n_u32(c);
152 : #endif
153 : }
154 : #endif /* ! USE_NO_SIMD */
155 :
/*
 * Return true if any elements in the vector are equal to the given scalar.
 */
static inline bool
vector8_has(const Vector8 v, const uint8 c)
{
	bool		result;

	/* pre-compute the result for assert checking */
#ifdef USE_ASSERT_CHECKING
	bool		assert_result = false;

	/* scalar reference implementation: inspect each byte of v in turn */
	for (Size i = 0; i < sizeof(Vector8); i++)
	{
		if (((const uint8 *) &v)[i] == c)
		{
			assert_result = true;
			break;
		}
	}
#endif							/* USE_ASSERT_CHECKING */

#if defined(USE_NO_SIMD)
	/* any bytes in v equal to c will evaluate to zero via XOR */
	result = vector8_has_zero(v ^ vector8_broadcast(c));
#else
	/* lanes that match c come back from vector8_eq() with the high bit set */
	result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
#endif

	/* compiles to nothing unless assertions are enabled */
	Assert(assert_result == result);
	return result;
}
188 :
189 : /*
190 : * Convenience function equivalent to vector8_has(v, 0)
191 : */
192 : static inline bool
193 160914 : vector8_has_zero(const Vector8 v)
194 : {
195 : #if defined(USE_NO_SIMD)
196 : /*
197 : * We cannot call vector8_has() here, because that would lead to a
198 : * circular definition.
199 : */
200 : return vector8_has_le(v, 0);
201 : #else
202 160914 : return vector8_has(v, 0);
203 : #endif
204 : }
205 :
/*
 * Return true if any elements in the vector are less than or equal to the
 * given scalar.
 */
static inline bool
vector8_has_le(const Vector8 v, const uint8 c)
{
	bool		result = false;

	/* pre-compute the result for assert checking */
#ifdef USE_ASSERT_CHECKING
	bool		assert_result = false;

	/* scalar reference implementation: inspect each byte of v in turn */
	for (Size i = 0; i < sizeof(Vector8); i++)
	{
		if (((const uint8 *) &v)[i] <= c)
		{
			assert_result = true;
			break;
		}
	}
#endif							/* USE_ASSERT_CHECKING */

#if defined(USE_NO_SIMD)

	/*
	 * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
	 * but it only works if c+1 <= 128 and if the highest bit in v is not set.
	 * Adapted from
	 * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
	 */
	if ((int64) v >= 0 && c < 0x80)
		result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
	else
	{
		/* one byte at a time */
		for (Size i = 0; i < sizeof(Vector8); i++)
		{
			if (((const uint8 *) &v)[i] <= c)
			{
				result = true;
				break;
			}
		}
	}
#else

	/*
	 * Use saturating subtraction to find bytes <= c, which will present as
	 * NUL bytes.  This approach is a workaround for the lack of unsigned
	 * comparison instructions on some architectures.
	 */
	result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
#endif

	/* compiles to nothing unless assertions are enabled */
	Assert(assert_result == result);
	return result;
}
264 :
265 : /*
266 : * Return true if the high bit of any element is set
267 : */
268 : static inline bool
226 269 5942204 : vector8_is_highbit_set(const Vector8 v)
270 : {
271 : #ifdef USE_SSE2
272 5942204 : return _mm_movemask_epi8(v) != 0;
273 : #elif defined(USE_NEON)
274 : return vmaxvq_u8(v) > 0x7F;
275 : #else
276 : return v & vector8_broadcast(0x80);
277 : #endif
278 : }
279 :
280 : /*
281 : * Exactly like vector8_is_highbit_set except for the input type, so it
282 : * looks at each byte separately.
283 : *
284 : * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
285 : * integer elements, but Arm does not, hence the need for a separate
286 : * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
287 : * check each 32-bit element, but that would require an additional mask
288 : * operation on x86.
289 : */
290 : #ifndef USE_NO_SIMD
291 : static inline bool
223 292 30 : vector32_is_highbit_set(const Vector32 v)
293 : {
294 : #if defined(USE_NEON)
295 : return vector8_is_highbit_set((Vector8) v);
296 : #else
297 30 : return vector8_is_highbit_set(v);
298 : #endif
299 : }
300 : #endif /* ! USE_NO_SIMD */
301 :
302 : /*
303 : * Return the bitwise OR of the inputs
304 : */
305 : static inline Vector8
226 306 16637648 : vector8_or(const Vector8 v1, const Vector8 v2)
307 : {
308 : #ifdef USE_SSE2
309 16637648 : return _mm_or_si128(v1, v2);
310 : #elif defined(USE_NEON)
311 : return vorrq_u8(v1, v2);
312 : #else
313 : return v1 | v2;
314 : #endif
315 : }
316 :
317 : #ifndef USE_NO_SIMD
318 : static inline Vector32
223 319 90 : vector32_or(const Vector32 v1, const Vector32 v2)
320 : {
321 : #ifdef USE_SSE2
322 90 : return _mm_or_si128(v1, v2);
323 : #elif defined(USE_NEON)
324 : return vorrq_u32(v1, v2);
325 : #endif
326 : }
327 : #endif /* ! USE_NO_SIMD */
328 :
329 : /*
330 : * Return the result of subtracting the respective elements of the input
331 : * vectors using saturation (i.e., if the operation would yield a value less
332 : * than zero, zero is returned instead). For more information on saturation
333 : * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
334 : */
335 : #ifndef USE_NO_SIMD
336 : static inline Vector8
337 160914 : vector8_ssub(const Vector8 v1, const Vector8 v2)
338 : {
339 : #ifdef USE_SSE2
340 160914 : return _mm_subs_epu8(v1, v2);
341 : #elif defined(USE_NEON)
342 : return vqsubq_u8(v1, v2);
343 : #endif
344 : }
345 : #endif /* ! USE_NO_SIMD */
346 :
347 : /*
348 : * Return a vector with all bits set in each lane where the corresponding
349 : * lanes in the inputs are equal.
350 : */
351 : #ifndef USE_NO_SIMD
352 : static inline Vector8
226 353 10101586 : vector8_eq(const Vector8 v1, const Vector8 v2)
354 : {
355 : #ifdef USE_SSE2
356 10101586 : return _mm_cmpeq_epi8(v1, v2);
357 : #elif defined(USE_NEON)
358 : return vceqq_u8(v1, v2);
359 : #endif
360 : }
361 : #endif /* ! USE_NO_SIMD */
362 :
363 : #ifndef USE_NO_SIMD
364 : static inline Vector32
223 365 120 : vector32_eq(const Vector32 v1, const Vector32 v2)
366 : {
367 : #ifdef USE_SSE2
368 120 : return _mm_cmpeq_epi32(v1, v2);
369 : #elif defined(USE_NEON)
370 : return vceqq_u32(v1, v2);
371 : #endif
372 : }
373 : #endif /* ! USE_NO_SIMD */
374 :
375 : #endif /* SIMD_H */
|