#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <x86intrin.h>
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)
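
/* Each tuple above is (type prefix, element bits, lane count, intrinsic
   suffix); the generic macro below instantiates one set of basic helpers
   (splat, load/store, compares, interleave) per 512-bit vector type.  */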
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_aligned (void *p) \
{ return (t##s##x##c) _mm512_load_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
{ _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); } \
  return (u32) _mm512_movepi16_mask ((__m512i) v);
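
/* Illustrative sketch, not part of the original header: the generated
   per-type helpers compose naturally, e.g. testing a 64-byte block for
   all-zero content with a single unaligned load.  The example_ name below
   is hypothetical.  */
static_always_inline int
example_u8x64_block_is_zero (void *p)
{
  /* u64x8_is_all_zero () uses a single test-mask (vptestmq) over the block */
  return u64x8_is_all_zero (u64x8_load_unaligned (p));
}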
always_inline t t##_pack (f lo, f hi) \
  return (t) fn ((__m512i) lo, (__m512i) hi); \

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
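/* Note: the AVX-512 pack instructions saturate and operate per 128-bit
   lane, so the packed result is lane-interleaved rather than fully
   sequential across the 512-bit register.  */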
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
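/* Both byte_swap helpers above use _mm512_shuffle_epi8, which indexes
   bytes within each 128-bit lane; the 16-byte swap pattern is therefore
   repeated four times to cover the whole 512-bit register.  */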
static_always_inline t f##_extract_lo (f v) \
  return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
static_always_inline t f##_extract_hi (f v) \
  return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
				      u32x16_extract_hi (v)));
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)
#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)
#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)
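/* _mm512_bslli_epi128 / _mm512_bsrli_epi128 shift by whole bytes within
   each 128-bit lane, not across the full 512-bit register.  */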
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) m);
#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)
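/* Like the byte shifts above, _mm512_alignr_epi8 concatenates and shifts
   within each 128-bit lane rather than across the whole register.  */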
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_load (t a, void *p, m mask) \
    return (t) p##_mask_loadu_##e ((i) a, mask, p); \
  static_always_inline t t##_mask_load_zero (void *p, m mask) \
    return (t) p##_maskz_loadu_##e (mask, p); \
  static_always_inline void t##_mask_store (t a, void *p, m mask) \
    p##_mask_storeu_##e (p, mask, (i) a); \
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#endif
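
/* Illustrative sketch, not part of the original header: loading a partial
   tail of n <= 16 u32 elements without reading past the end of the buffer.
   The example_ name is hypothetical.  */
static_always_inline u32x16
example_u32x16_load_tail (u32 *p, u32 n)
{
  /* mask has the low n bits set; masked-off lanes are zeroed */
  return u32x16_mask_load_zero (p, (u16) ((1u << n) - 1));
}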
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
#define _(t, m, e, p, it) \
  static_always_inline m t##_is_equal_mask (t a, t b) \
    return p##_cmpeq_##e##_mask ((it) a, (it) b); \
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
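/* Each helper above returns one mask bit per lane; counting matching u32
   lanes is then e.g. __builtin_popcount (u32x16_is_equal_mask (a, b)).  */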
#define _(f, t, fn, it) \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
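/* The widening conversions above sign/zero extend (cvtepi16_epi32,
   cvtepu32_epi64); the narrowing ones use unsigned saturation
   (cvtusepi32_epi16).  */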
#define _(vt, mt, p, it, epi) \
  static_always_inline vt vt##_compress (vt a, mt mask) \
    return (vt) p##_maskz_compress_##epi (mask, (it) a); \
  static_always_inline vt vt##_expand (vt a, mt mask) \
    return (vt) p##_maskz_expand_##epi (mask, (it) a); \
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
    p##_mask_compressstoreu_##epi (p, mask, (it) v); \
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#endif
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#endif
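
/* Illustrative sketch, not part of the original header: keep only the u32
   lanes selected by 'mask' and store them contiguously at 'dst'; the number
   of stored elements equals the number of set bits in 'mask'.  The
   example_ name is hypothetical; it relies on the compress_store helper
   generated above.  */
static_always_inline void
example_u32x16_copy_selected (u32 *dst, u32x16 v, u16 mask)
{
  u32x16_compress_store (v, mask, dst);
}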
#ifndef __AVX512VBMI2__
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
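/* Without AVX512_VBMI2 the 16-bit compress helpers above are emulated by
   widening to 32 bits, compressing, and narrowing back.  */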
  __m512i r[16], a, b, c, d, x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
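
/* The unpack/permute network above transposes a 16x16 matrix of u32 held
   in m[0] .. m[15] in place: two element-interleave stages (epi32, then
   epi64) followed by two cross-lane permutex2var_epi64 stages gather the
   transposed rows.  */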
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
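
/* Analogous to the u32 version above, this sequence transposes an 8x8
   matrix of u64 held in m[0] .. m[7] in place, using one unpack stage and
   two permutex2var_epi64 stages.  */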