FD.io VPP v18.04-17-g3a0d853 (Vector Packet Processing)
vector_sse42.h
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>
/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
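
/* Usage sketch (editor's illustration, assuming the vppinfra vector types
   from <vppinfra/vector.h> are in scope): with a = {a0,...,a15} and
   b = {b0,...,b15}, u8x16_interleave_lo (a, b) yields
   {a0,b0,a1,b1,...,a7,b7} and u8x16_interleave_hi (a, b) yields
   {a8,b8,a9,b9,...,a15,b15}. */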

/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}
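
/* Usage sketch (editor's illustration): the packs narrow two inputs into
   one vector of half-width lanes, saturating out-of-range values.
   u16x8_pack uses unsigned saturation (a lane of 0x1ff becomes 0xff);
   i16x8_pack uses signed saturation (a lane of -200 becomes -128). */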

#ifndef __ICC
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
#endif

/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
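
/* Usage sketch (hypothetical helper, editor's illustration): the wrappers
   above compile to movdqu, so they tolerate any alignment. */
always_inline void
example_copy16_unaligned (u8 * dst, u8 * src)
{
  u8x16_store_unaligned (u8x16_load_unaligned ((u8x16 *) src),
			 (u8x16 *) dst);
}
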
#define _signed_binop(n,m,f,g)						\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
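
/* Usage sketch (editor's illustration): the instantiations above generate
   e.g. u8x16_add_saturate and u8x16_sub_saturate, which clamp instead of
   wrapping: for u8 lanes, 250 + 10 gives 255 and 3 - 10 gives 0. */
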
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
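
/* Usage sketch (hypothetical helper, editor's illustration): combining
   the low and high halves of a 16x16 multiply recovers the full 32-bit
   products of the four low lanes. */
always_inline u32x4
example_u16x8_mul_full_lo (u16x8 x, u16x8 y)
{
  u16x8 lo = u16x8_mul_lo (x, y);	/* low 16 bits of each product */
  u16x8 hi = u16x8_mul_hi (x, y);	/* high 16 bits of each product */
  /* Interleaving lo/hi lane-by-lane rebuilds the 32-bit results. */
  return (u32x4) u16x8_interleave_lo (lo, hi);
}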

/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
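
/* Usage sketch (editor's illustration): the macro generates both an
   immediate-count form, e.g. u32x4_ishift_left (x, 3), and a form taking
   the count from a vector operand, e.g. u32x4_shift_left (x, y); srl is a
   logical right shift, sra an arithmetic one that preserves the sign. */
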
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
};

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
};

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
};

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
};

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
};

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
};

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
};

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
};

#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
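
/* Usage sketch (editor's illustration): the word shifts move whole lanes
   rather than bits; u32x4_word_shift_right (x, 1) is a byte-wise
   _mm_srli_si128 by sizeof (u32), so lane 0 is discarded, the remaining
   lanes move down, and the top lane is zero filled. */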

/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
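
/* Usage sketch (editor's illustration): a rotate is composed from two
   opposite shifts, e.g. u32x4_irotate_left (w, 8) computes
   (w << 8) | (w >> 24) in each 32-bit lane. */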

#ifndef __clang__
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

always_inline int
u8x16_is_all_zero (u8x16 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

always_inline int
u16x8_is_all_zero (u16x8 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

always_inline int
u32x4_is_all_zero (u32x4 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

always_inline int
u64x2_is_all_zero (u64x2 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}
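
/* Usage sketch (hypothetical helper, editor's illustration): ptest-based
   equality check; x ^ y is all zero iff every lane matches. */
always_inline int
example_u32x4_all_equal (u32x4 x, u32x4 y)
{
  return u32x4_is_all_zero (x ^ y);
}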

#define u32x4_select(A,MASK)						\
({									\
  u32x4 _x, _y;								\
  _x = (A);								\
  asm volatile ("pshufd %[mask], %[x], %[y]"				\
		: /* outputs */ [y] "=x" (_y)				\
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK));	\
  _y;									\
})

#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << (2*0))		\
		      | ((i) << (2*1))		\
		      | ((i) << (2*2))		\
		      | ((i) << (2*3))))
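
/* Usage sketch (editor's illustration): the mask packs four 2-bit source
   lane indices, so u32x4_select (x, 0x1b) reverses the lanes (0x1b
   selects 3,2,1,0) and u32x4_splat_word (x, 2) broadcasts lane 2 into
   all four lanes. */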

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}
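
/* Usage sketch (hypothetical helper, editor's illustration): finding the
   index of the first zero byte in a 16-byte chunk, as in a strlen-style
   scan; bit i of the mask is set iff byte i of x is zero. */
always_inline int
example_first_zero_byte (u8x16 x)
{
  u32 m = u8x16_zero_byte_mask (x);
  return m ? __builtin_ctz (m) : -1;
}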

always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}
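
/* Usage sketch (editor's illustration): the scalar reductions above fold
   the vector onto itself with word shifts, halving the number of live
   candidates each step until lane 0 holds the result; e.g.
   u8x16_max_scalar returns the largest of the 16 bytes. */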

#undef _signed_binop

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
Definition: vector_sse42.h:509