#ifndef included_vector_avx512_h
#define included_vector_avx512_h
#include <vppinfra/clib.h>
#include <x86intrin.h>
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)
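/* For each 512-bit integer type the macro below generates: splat,
   aligned/unaligned load and store, is_all_zero, is_equal, is_all_equal,
   is_zero_mask and interleave_lo/hi helpers. */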
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_aligned (void *p) \
{ return (t##s##x##c) _mm512_load_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
{ _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); }
foreach_avx512_vec512i foreach_avx512_vec512u
#undef _

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}
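/* Saturating 512-bit packs; as with the underlying intrinsics, lanes are
   packed within each 128-bit lane rather than across the whole vector. */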
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m512i) lo, (__m512i) hi); \
  }
_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _
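/* Byte order reversal within each 32-bit (and, below, each 16-bit) lane,
   implemented as a per-128-bit-lane byte shuffle. */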
static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
#define _(f, t) \
  static_always_inline t f##_extract_lo (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
  } \
  static_always_inline t f##_extract_hi (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
  }

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _
static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}
static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}
static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}
static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)
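/* d is an 8-bit truth table applied bit-wise to (a, b, c): result bit i is
   bit ((a_i << 2) | (b_i << 1) | c_i) of d, e.g. d = 0x96 computes a ^ b ^ c. */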
#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)
#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)
#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)
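/* Note: like the SSE2 byte-shift instructions these shift by n bytes within
   each 128-bit lane, not across the full 512-bit vector. */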
static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}
static_always_inline u8x64
u8x64_shuffle (u8x64 v, u8x64 m)
{
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) m);
}
#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)
static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}
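/* Masked loads and stores: the helpers generated below read or write only
   the lanes selected by the bitmask, leaving (or zeroing) the rest. */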
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_load (t a, void *p, m mask) \
  { \
    return (t) p##_mask_loadu_##e ((i) a, mask, p); \
  } \
  static_always_inline t t##_mask_load_zero (void *p, m mask) \
  { \
    return (t) p##_maskz_loadu_##e (mask, p); \
  } \
  static_always_inline void t##_mask_store (t a, void *p, m mask) \
  { \
    p##_mask_storeu_##e (p, mask, (i) a); \
  }
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#endif
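/* Usage sketch (illustrative, not part of this header): copy n <= 16 u32
   elements with a single masked load + store instead of a scalar tail loop:

     u16 mask = pow2_mask (n);
     u32x16_mask_store (u32x16_mask_load_zero (src, mask), dst, mask);
*/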
static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}
static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}
static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}
static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
#define _(t, m, e, p, it) \
  static_always_inline m t##_is_equal_mask (t a, t b) \
  { \
    return p##_cmpeq_##e##_mask ((it) a, (it) b); \
  }
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
#undef _
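/* Usage sketch (illustrative, not part of this header): find the first
   32-bit lane equal to 'needle':

     u8 eq = u32x4_is_equal_mask (haystack, u32x4_splat (needle));
     if (eq)
       index = count_trailing_zeros (eq);
*/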
#define _(f, t, fn, it) \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _
#define _(vt, mt, p, it, epi) \
  static_always_inline vt vt##_compress (vt a, mt mask) \
  { \
    return (vt) p##_maskz_compress_##epi (mask, (it) a); \
  } \
  static_always_inline vt vt##_expand (vt a, mt mask) \
  { \
    return (vt) p##_maskz_expand_##epi (mask, (it) a); \
  } \
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
  { \
    p##_mask_compressstoreu_##epi (p, mask, (it) v); \
  }
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
#endif
#undef _
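/* Usage sketch (illustrative, not part of this header): store only the
   lanes selected by 'mask' contiguously at 'dst':

     u32x16_compress_store (v, mask, dst);
     dst += count_set_bits (mask);
*/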
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC256_COMPRESS_U8_U16
#endif
#endif

#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC512_COMPRESS_U8_U16
#endif
#endif
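/* Without AVX512VBMI2 there is no native 16-bit compress, so the fallbacks
   below widen to 32-bit lanes, compress, then narrow back. */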
#ifndef __AVX512VBMI2__
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}

static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif
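/* In-register transposes: a 16x16 matrix of u32 and, further below, an 8x8
   matrix of u64, built from unpack operations followed by two-source
   permutes. */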
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };
  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);
  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}
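/* Usage sketch (illustrative, not part of this header): after loading 16
   rows into m[0..15], u32x16_transpose (m) leaves original column i in m[i]. */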
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };
  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);
  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}