FD.io VPP  v20.05.1-6-gf53edbc3b
Vector Packet Processing
vector_avx512.h
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load_aligned, store_aligned, load_unaligned, store_unaligned,
   is_all_zero, is_equal, is_all_equal, is_zero_mask, interleave_lo,
   interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_aligned (void *p) \
{ return (t##s##x##c) _mm512_load_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
{ _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); } \


foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
/* *INDENT-ON* */
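
/*
 * Example usage of the generated operations (editor's sketch, not part of
 * the original header; `p' is a hypothetical pointer):
 *
 *   u32x16 v = u32x16_splat (7);
 *   u32x16 w = u32x16_load_unaligned (p);  // p need not be 64-byte aligned
 *   if (u32x16_is_all_equal (w, 7))
 *     ;                                    // all 16 lanes equal 7
 *   u16 nz = u32x16_is_zero_mask (w);      // TEST semantics: bit i is set
 *                                          // when lane i is non-zero
 */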

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
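
/*
 * These shuffles do bulk endian conversion, e.g. sixteen u32s from network
 * to host byte order in one instruction (editor's sketch; `p' is a
 * hypothetical pointer to big-endian data):
 *
 *   u32x16 a = u32x16_byte_swap (u32x16_load_unaligned (p));
 */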

static_always_inline u32x8
u32x16_extract_lo (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u32x8
u32x16_extract_hi (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u8x32
u8x64_extract_lo (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u8x32
u8x64_extract_hi (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}
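
/*
 * Each destination lane of _mm512_permutex2var_epi64 is selected from the
 * 16-lane concatenation {a[0..7], b[8..15]} by the low 4 bits of the
 * corresponding mask lane.  Editor's sketch, interleaving the low halves
 * of a and b:
 *
 *   u64x8 m = { 0, 8, 1, 9, 2, 10, 3, 11 };
 *   u64x8 r = u64x8_permute (a, b, m);  // {a0, b0, a1, b1, a2, b2, a3, b3}
 */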

#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)

static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}
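
/*
 * Editor's note: VPTERNLOG computes any 3-input boolean function from its
 * 8-bit truth table; output bit = imm8 bit number (a<<2 | b<<1 | c).
 * 0x96 = 0b10010110 is set exactly on odd parity, so u8x64_xor3 computes
 * a ^ b ^ c in a single instruction.  Other functions drop in the same
 * way, e.g. (a & b) | c has truth table 0xea:
 *
 *   r = u32x16_ternary_logic (a, b, c, 0xea);
 */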

static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

static_always_inline u8x64
u8x64_mask_load (u8x64 a, void *p, u64 mask)
{
  return (u8x64) _mm512_mask_loadu_epi8 ((__m512i) a, mask, p);
}

static_always_inline void
u8x64_mask_store (u8x64 a, void *p, u64 mask)
{
  _mm512_mask_storeu_epi8 (p, mask, (__m512i) a);
}
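
/*
 * Masked loads/stores make tail handling branch-free; masked-off bytes are
 * never touched, so accesses straddling the buffer end are safe.  Editor's
 * sketch for copying n < 64 trailing bytes (`dst', `src' and `n' are
 * hypothetical):
 *
 *   u64 mask = ((u64) 1 << n) - 1;  // low n bits set
 *   u8x64 t = u8x64_mask_load (u8x64_splat (0), src, mask);
 *   u8x64_mask_store (t, dst, mask);
 */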

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
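
/*
 * Blend selects per lane without branches: lane i of the result comes from
 * b when bit i of the mask is set, from a otherwise.  Editor's sketch:
 *
 *   u32x16 r = u32x16_mask_blend (a, b, 0x00ff);  // low 8 lanes from b
 */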

static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}
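
/*
 * Editor's note: the transpose above is a butterfly network.  The 32-bit
 * unpacks interleave adjacent rows, the 64-bit unpacks interleave the
 * resulting pairs, and the two permutex2var rounds move 256-bit blocks
 * into place, so the 16x16 transpose takes O(n log n) shuffles instead of
 * 256 scalar moves.  It works in place:
 *
 *   u32x16 m[16];          // m[i] holds row i
 *   u32x16_transpose (m);  // now m[i] holds column i
 */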

static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */