FD.io VPP  v21.10.1-2-g0a485f517
Vector Packet Processing
vector_avx512.h
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

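/* Each tuple in the lists above is (sign prefix, element bits, lane count,
   intrinsic suffix); e.g. _(u,32,16,epi32) drives the generator macro below
   to emit u32x16_* helpers on top of the _mm512_*_epi32 intrinsics. */
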
/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, is_zero_mask */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_aligned (void *p) \
{ return (t##s##x##c) _mm512_load_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
{ _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); }

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
/* *INDENT-ON* */
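
/* For illustration, the (u,32,16,epi32) tuple expands to helpers roughly of
   this shape:

     static_always_inline u32x16
     u32x16_splat (u32 x)
     { return (u32x16) _mm512_set1_epi32 (x); }

     static_always_inline u16
     u32x16_is_zero_mask (u32x16 v)
     { return _mm512_test_epi32_mask ((__m512i) v, (__m512i) v); }
*/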

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}

/* 512-bit packs */
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m512i) lo, (__m512i) hi); \
  }

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _

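/* Usage sketch: the pack helpers narrow two 512-bit vectors into one with
   saturation, e.g.

     i16x32 lo = i16x32_splat (300), hi = i16x32_splat (-5);
     u8x64 p = u8x64_pack (lo, hi);  // unsigned saturation: 300 -> 255, -5 -> 0

   Note that vpackuswb interleaves per 128-bit lane, so the element order is
   the instruction's, not a plain concatenation of lo and hi. */
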
static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

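/* _mm512_shuffle_epi8 shuffles within each 128-bit lane, which is why the
   16-byte swap patterns above are repeated four times; every 32-bit
   (respectively 16-bit) element still has its own bytes reversed, as needed
   for host/network byte-order conversion. */
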
#define _(f, t) \
  static_always_inline t f##_extract_lo (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
  } \
  static_always_inline t f##_extract_hi (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
  }

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
                                      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
                                            (__m512i) b);
}

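/* In u64x8_permute the mask holds per-lane source indices for vpermt2q:
   values 0-7 pick the corresponding qword of a, values 8-15 pick from b.
   Example (a and b being any two u64x8 values):

     u64x8 idx = { 0, 2, 4, 6, 8, 10, 12, 14 };
     u64x8 r = u64x8_permute (a, b, idx);  // even qwords of a, then of b
*/
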
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)

static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
                                            (__m512i) c, 0x96);
}

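/* Immediate 0x96 is the ternary-logic truth table for a ^ b ^ c, so
   u8x64_xor3 folds three XORs into a single vpternlogd. */
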
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

static_always_inline u8x64
u8x64_shuffle (u8x64 v, u8x64 m)
{
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) m);
}

#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)

static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}

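/* u32x16_sum_elts is a horizontal add: the two align_right steps make every
   128-bit lane accumulate its own four u32 elements, the extract_lo/hi add
   folds 512 bits down to 256, and sum8[0] + sum8[4] combines the two
   remaining per-lane totals. */
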
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_load (t a, void *p, m mask) \
  { \
    return (t) p##_mask_loadu_##e ((i) a, mask, p); \
  } \
  static_always_inline t t##_mask_load_zero (void *p, m mask) \
  { \
    return (t) p##_maskz_loadu_##e (mask, p); \
  } \
  static_always_inline void t##_mask_store (t a, void *p, m mask) \
  { \
    p##_mask_storeu_##e (p, mask, (i) a); \
  }

_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u8x16, u16, _mm, __m128i, epi8)
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _

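/* Usage sketch (src and dst are hypothetical void * buffers): the generated
   mask helpers make partial vector loads/stores explicit, e.g. touching only
   the first 10 of 16 u32 elements:

     u32x16 v = u32x16_mask_load_zero (src, 0x03ff);  // lanes 10..15 are zeroed
     u32x16_mask_store (v, dst, 0x03ff);              // writes lanes 0..9 only
*/
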
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#endif

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}

#define _(t, m, e, p, it) \
  static_always_inline m t##_is_equal_mask (t a, t b) \
  { \
    return p##_cmpeq_##e##_mask ((it) a, (it) b); \
  }
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _

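/* Usage sketch: the *_is_equal_mask helpers return one bit per lane rather
   than a lane-wide boolean vector, e.g.

     u16 eq = u32x16_is_equal_mask (a, b);  // bit i set when a[i] == b[i]

   which composes naturally with the mask load/store, blend and compress
   helpers above and below. */
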
#define _(f, t, fn, it) \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _

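/* The conversions generated above follow the underlying intrinsics:
   u32x16_from_u16x16 sign-extends each element (vpmovsxwd),
   u16x16_from_u32x16 and u16x8_from_u32x8 narrow with unsigned saturation
   (vpmovusdw), and u64x8_from_u32x8 zero-extends (vpmovzxdq). */
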
#define _(vt, mt, p, it, epi) \
  static_always_inline vt vt##_compress (vt a, mt mask) \
  { \
    return (vt) p##_maskz_compress_##epi (mask, (it) a); \
  } \
  static_always_inline vt vt##_expand (vt a, mt mask) \
  { \
    return (vt) p##_maskz_expand_##epi (mask, (it) a); \
  } \
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
  { \
    p##_mask_compressstoreu_##epi (p, mask, (it) v); \
  }

_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u8x16, u16, _mm, __m128i, epi8)
#endif
#undef _

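/* Usage sketch: compress packs the lanes selected by the mask into the low
   elements (zeroing the rest), which is handy for collecting sparse results:

     u32x16 v = u32x16_splat (7);
     v[3] = 0;                                   // drop one lane
     u16 keep = ~u32x16_is_equal_mask (v, u32x16_splat (0));
     u32x16 packed = u32x16_compress (v, keep);  // 15 sevens in lanes 0..14
*/
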
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC256_COMPRESS_U8_U16
#endif
#endif
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC512_COMPRESS_U8_U16
#endif
#endif

#ifndef __AVX512VBMI2__
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}

static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif

static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}

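/* u32x16_transpose transposes a 16x16 matrix of u32 in place, with m[i]
   holding row i: afterwards element j of m[i] is the original element i of
   m[j].  The same unpack + vpermt2q pattern is reused below for the 8x8 u64
   case. */
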
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */