FD.io VPP  v16.06
Vector Packet Processing
vector_sse2.h
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h> /* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
always_inline u8x16 u8x16_interleave_hi (u8x16 a, u8x16 b)
{ return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b); }

always_inline u8x16 u8x16_interleave_lo (u8x16 a, u8x16 b)
{ return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b); }

always_inline u16x8 u16x8_interleave_hi (u16x8 a, u16x8 b)
{ return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b); }

always_inline u16x8 u16x8_interleave_lo (u16x8 a, u16x8 b)
{ return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b); }

always_inline u32x4 u32x4_interleave_hi (u32x4 a, u32x4 b)
{ return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b); }

always_inline u32x4 u32x4_interleave_lo (u32x4 a, u32x4 b)
{ return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b); }

always_inline u64x2 u64x2_interleave_hi (u64x2 a, u64x2 b)
{ return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b); }

always_inline u64x2 u64x2_interleave_lo (u64x2 a, u64x2 b)
{ return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b); }

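/* Editor's sketch (illustrative, not part of the original header):
   interleaving with a zero vector is the usual SSE2 idiom for widening,
   here zero-extending the low eight u8 lanes to u16. The helper name
   is hypothetical. */
always_inline u16x8 u8x16_widen_lo_sketch (u8x16 x)
{
  u8x16 zero = {0};
  /* Bytes of x land in the even byte positions, zeros in the odd ones;
     each little-endian u16 lane then holds one zero-extended byte. */
  return (u16x8) u8x16_interleave_lo (x, zero);
}
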
/* 64 bit interleaves. */
always_inline u8x8 u8x8_interleave_hi (u8x8 a, u8x8 b)
{ return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b); }

always_inline u8x8 u8x8_interleave_lo (u8x8 a, u8x8 b)
{ return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b); }

always_inline u16x4 u16x4_interleave_hi (u16x4 a, u16x4 b)
{ return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b); }

always_inline u16x4 u16x4_interleave_lo (u16x4 a, u16x4 b)
{ return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b); }

always_inline u32x2 u32x2_interleave_hi (u32x2 a, u32x2 b)
{ return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b); }

always_inline u32x2 u32x2_interleave_lo (u32x2 a, u32x2 b)
{ return (u32x2) _m_punpckldq ((__m64) a, (__m64) b); }

/* 128 bit packs. */
always_inline u8x16 u16x8_pack (u16x8 lo, u16x8 hi)
{ return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi); }

always_inline i8x16 i16x8_pack (i16x8 lo, i16x8 hi)
{ return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi); }

always_inline u16x8 u32x4_pack (u32x4 lo, u32x4 hi)
{ return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi); }

/* 64 bit packs. */
always_inline u8x8 u16x4_pack (u16x4 lo, u16x4 hi)
{ return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi); }

always_inline i8x8 i16x4_pack (i16x4 lo, i16x4 hi)
{ return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi); }

always_inline u16x4 u32x2_pack (u32x2 lo, u32x2 hi)
{ return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi); }

always_inline i16x4 i32x2_pack (i32x2 lo, i32x2 hi)
{ return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi); }

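/* Editor's sketch (illustrative, not in the original): the packs above
   narrow with saturation, e.g. two u16x8 vectors into one u8x16. Note
   that _mm_packus_epi16 reads its inputs as signed 16 bit lanes and
   clamps each to [0, 255]. */
always_inline u8x16 u16x8_narrow_sketch (u16x8 lo, u16x8 hi)
{
  /* Inverse of the widening interleave: lanes of lo fill result bytes
     0-7, lanes of hi fill bytes 8-15. */
  return u16x8_pack (lo, hi);
}
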
/* Splats: replicate scalar value into vector. */
always_inline u64x2 u64x2_splat (u64 a)
{
  u64x2 x = {a};
  x = u64x2_interleave_lo (x, x);
  return x;
}

always_inline u32x4 u32x4_splat (u32 a)
{
  u32x4 x = {a};
  x = u32x4_interleave_lo (x, x);
  x = (u32x4) u64x2_interleave_lo ((u64x2) x, (u64x2) x);
  return x;
}

always_inline u16x8 u16x8_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x8) u32x4_splat (t);
}

always_inline u8x16 u8x16_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x16) u16x8_splat (t);
}

always_inline u32x2 u32x2_splat (u32 a)
{
  u32x2 x = {a};
  x = u32x2_interleave_lo (x, x);
  return x;
}

always_inline u16x4 u16x4_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x4) u32x2_splat (t);
}

always_inline u8x8 u8x8_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x8) u32x2_splat (t);
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat

#ifndef __ICC
always_inline u64x2 u64x2_read_lo (u64x2 x, u64 * a)
{ return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a); }

always_inline u64x2 u64x2_read_hi (u64x2 x, u64 * a)
{ return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a); }

always_inline void u64x2_write_lo (u64x2 x, u64 * a)
{ _mm_storel_pi ((__m64 *) a, (__m128) x); }

always_inline void u64x2_write_hi (u64x2 x, u64 * a)
{ _mm_storeh_pi ((__m64 *) a, (__m128) x); }
#endif

/* Unaligned loads/stores. */

#define _(t) \
  always_inline void t##_store_unaligned (t x, t * a) \
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); } \
  always_inline t t##_load_unaligned (t * a) \
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_ (u8x16)
_ (u16x8)
_ (u32x4)
_ (u64x2)
_ (i8x16)
_ (i16x8)
_ (i32x4)
_ (i64x2)

#undef _

#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
  \
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)

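/* Editor's sketch (illustrative, not in the original): combining the
   unaligned loads/stores with the generated u32x4_add to sum two u32
   arrays four lanes at a time. Assumes n is a multiple of 4; a scalar
   tail loop would be needed otherwise. */
always_inline void
u32_array_add_sketch (u32 * dst, u32 * a, u32 * b, int n)
{
  int i;
  for (i = 0; i + 4 <= n; i += 4)
    {
      u32x4 x = u32x4_load_unaligned ((u32x4 *) (a + i));
      u32x4 y = u32x4_load_unaligned ((u32x4 *) (b + i));
      u32x4_store_unaligned (u32x4_add (x, y), (u32x4 *) (dst + i));
    }
}
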
/* Addition/subtraction with saturation. These cannot reuse
   _signed_binop, since the intrinsic suffix (epi vs. epu) must follow
   the signedness of the operands. */
#define _saturating_binop(n,m,f,g) \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##_epu##n ((__m128i) x, (__m128i) y); } \
  \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##_epi##n ((__m128i) x, (__m128i) y); }

_saturating_binop (8, 16, add_saturate, adds)
_saturating_binop (16, 8, add_saturate, adds)
_saturating_binop (8, 16, sub_saturate, subs)
_saturating_binop (16, 8, sub_saturate, subs)

#undef _saturating_binop

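/* Editor's sketch (illustrative, not in the original): unsigned
   saturating subtraction clamps negative results to zero, so a
   per-byte absolute difference falls out of two subtractions. */
always_inline u8x16 u8x16_abs_diff_sketch (u8x16 a, u8x16 b)
{
  /* At most one of the two saturating differences is nonzero per lane. */
  return u8x16_sub_saturate (a, b) | u8x16_sub_saturate (b, a);
}
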
/* Multiplication. */
always_inline i16x8 i16x8_mul_lo (i16x8 x, i16x8 y)
{ return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y); }

always_inline u16x8 u16x8_mul_lo (u16x8 x, u16x8 y)
{ return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16x8 i16x8_mul_hi (i16x8 x, i16x8 y)
{ return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y); }

always_inline u16x8 u16x8_mul_hi (u16x8 x, u16x8 y)
{ return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y); }

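/* Editor's sketch (illustrative, not in the original): mul_lo/mul_hi
   return the two halves of each 32 bit product, so a full widening
   multiply of the low four lanes is one interleave away. */
always_inline u32x4 u16x8_mul_wide_lo_sketch (u16x8 x, u16x8 y)
{
  u16x8 lo = u16x8_mul_lo (x, y);
  u16x8 hi = u16x8_mul_hi (x, y);
  /* Low product halves go to the even 16 bit lanes, high halves to the
     odd ones; each little-endian u32 lane is then one full product. */
  return (u32x4) u16x8_interleave_lo (lo, hi);
}
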
/* 128 bit shifts. */

#define _(p,a,b,c,f) \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
  \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_ (u, 16, 8, left, sll)
_ (u, 32, 4, left, sll)
_ (u, 64, 2, left, sll)
_ (u, 16, 8, right, srl)
_ (u, 32, 4, right, srl)
_ (u, 64, 2, right, srl)
_ (i, 16, 8, left, sll)
_ (i, 32, 4, left, sll)
_ (i, 64, 2, left, sll)
_ (i, 16, 8, right, sra)
_ (i, 32, 4, right, sra)
/* Note: SSE2 has no 64 bit arithmetic (sign-extending) right shift. */

#undef _

/* 64 bit shifts. */
always_inline u16x4 u16x4_shift_left (u16x4 x, u16x4 i)
{ return (u16x4) _m_psllw ((__m64) x, (__m64) i); }

always_inline u32x2 u32x2_shift_left (u32x2 x, u32x2 i)
{ return (u32x2) _m_pslld ((__m64) x, (__m64) i); }

always_inline u16x4 u16x4_shift_right (u16x4 x, u16x4 i)
{ return (u16x4) _m_psrlw ((__m64) x, (__m64) i); }

always_inline u32x2 u32x2_shift_right (u32x2 x, u32x2 i)
{ return (u32x2) _m_psrld ((__m64) x, (__m64) i); }

always_inline i16x4 i16x4_shift_left (i16x4 x, i16x4 i)
{ return (i16x4) _m_psllw ((__m64) x, (__m64) i); }

always_inline i32x2 i32x2_shift_left (i32x2 x, i32x2 i)
{ return (i32x2) _m_pslld ((__m64) x, (__m64) i); }

always_inline i16x4 i16x4_shift_right (i16x4 x, i16x4 i)
{ return (i16x4) _m_psraw ((__m64) x, (__m64) i); }

always_inline i32x2 i32x2_shift_right (i32x2 x, i32x2 i)
{ return (i32x2) _m_psrad ((__m64) x, (__m64) i); }

#define u8x16_word_shift_left(a,n)  ((u8x16) _mm_slli_si128 ((__m128i) (a), (n)))
#define u8x16_word_shift_right(a,n) ((u8x16) _mm_srli_si128 ((__m128i) (a), (n)))

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
            | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
            | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_ (u16, 8, left, right);
_ (u16, 8, right, left);
_ (u32, 4, left, right);
_ (u32, 4, right, left);
_ (u64, 2, left, right);
_ (u64, 2, right, left);

#undef _

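/* Editor's sketch (illustrative, not in the original): the simulated
   immediate rotate in a hash-style mixing step. The rotate count and
   golden-ratio constant are arbitrary choices for illustration. */
always_inline u32x4 u32x4_mix_sketch (u32x4 x)
{
  return u32x4_irotate_left (x, 13) + u32x4_splat (0x9e3779b9);
}
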
352 #ifndef __clang__
353 #define _(t,n,lr1,lr2) \
354  always_inline t##x##n \
355  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
356  { \
357  int m = sizeof (t##x##n) / sizeof (t); \
358  ASSERT (i >= 0 && i < m); \
359  return (t##x##n##_word_shift_##lr1 (w0, i) \
360  | t##x##n##_word_shift_##lr2 (w1, m - i)); \
361  } \
362  \
363  always_inline t##x##n \
364  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
365  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
366 
367 _ (u8, 16, left, right);
368 _ (u8, 16, right, left);
369 _ (u16, 8, left, right);
370 _ (u16, 8, right, left);
371 _ (u32, 4, left, right);
372 _ (u32, 4, right, left);
373 _ (u64, 2, left, right);
374 _ (u64, 2, right, left);
375 
376 #undef _
377 #endif
378 
/* Compare operations. */
always_inline u8x16 u8x16_is_equal (u8x16 x, u8x16 y)
{ return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y); }

always_inline i8x16 i8x16_is_equal (i8x16 x, i8x16 y)
{ return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y); }

always_inline u16x8 u16x8_is_equal (u16x8 x, u16x8 y)
{ return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16x8 i16x8_is_equal (i16x8 x, i16x8 y)
{ return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y); }

always_inline u32x4 u32x4_is_equal (u32x4 x, u32x4 y)
{ return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y); }

always_inline i32x4 i32x4_is_equal (i32x4 x, i32x4 y)
{ return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y); }

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{ return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y); }

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{ return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y); }

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{ return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y); }

always_inline u8x16 u8x16_is_zero (u8x16 x)
{
  u8x16 zero = {0};
  return u8x16_is_equal (x, zero);
}

always_inline u16x8 u16x8_is_zero (u16x8 x)
{
  u16x8 zero = {0};
  return u16x8_is_equal (x, zero);
}

always_inline u32x4 u32x4_is_zero (u32x4 x)
{
  u32x4 zero = {0};
  return u32x4_is_equal (x, zero);
}

#define u32x4_select(A,MASK) \
({ \
  u32x4 _x, _y; \
  _x = (A); \
  asm volatile ("pshufd %[mask], %[x], %[y]" \
                : /* outputs */ [y] "=x" (_y) \
                : /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y; \
})

#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << (2*0)) \
                      | ((i) << (2*1)) \
                      | ((i) << (2*2)) \
                      | ((i) << (2*3))))

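/* Editor's sketch (illustrative, not in the original): MASK is a
   pshufd-style immediate of four 2 bit source-lane indices, with the
   lowest bits selecting destination lane 0; 0x1b (0b00011011) thus
   reverses the four 32 bit lanes. */
#define u32x4_reverse_sketch(x) u32x4_select ((x), 0x1b)
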
/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
                : /* outputs */ [result] "=r" (result)
                : /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
                : /* outputs */ [result] "=x" (result)
                : /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{ return (i32x4) u32x4_set0 ((u32) x); }

always_inline i32
i32x4_get0 (i32x4 x)
{ return (i32) u32x4_get0 ((u32x4) x); }

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32 u8x16_compare_byte_mask (u8x16 x)
{ return _mm_movemask_epi8 ((__m128i) x); }

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32 u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
          | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32 u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = {0};
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32 u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = {0};
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32 u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = {0};
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}

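/* Editor's sketch (illustrative, not in the original): a typical use
   of the zero-byte mask, locating the first zero byte in a 16 byte
   block for strlen-style scanning. Assumes the GCC/clang builtin
   __builtin_ctz; returns -1 when no byte is zero. */
always_inline int u8x16_first_zero_index_sketch (u8x16 x)
{
  u32 m = u8x16_zero_byte_mask (x);
  return m ? __builtin_ctz (m) : -1;
}
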
always_inline u8x16 u8x16_max (u8x16 x, u8x16 y)
{ return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y); }

always_inline u32 u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16 u8x16_min (u8x16 x, u8x16 y)
{ return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y); }

always_inline u8 u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8 i16x8_max (i16x8 x, i16x8 y)
{ return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16 i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8 i16x8_min (i16x8 x, i16x8 y)
{ return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y); }

always_inline i16 i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

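/* Editor's sketch (illustrative, not in the original): the scalar
   min/max reductions make a branch-free range check over all 16
   bytes of a vector. */
always_inline int u8x16_all_in_range_sketch (u8x16 x, u8 lo, u8 hi)
{
  return u8x16_min_scalar (x) >= lo && u8x16_max_scalar (x) <= hi;
}
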
#undef _signed_binop

#endif /* included_vector_sse2_h */