FD.io VPP  v17.07.01-10-g3be13f0
Vector Packet Processing
vector_sse2.h
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}

/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

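/* Illustrative sketch, not part of the original header: interleaving a
   vector with zeros is the standard SSE2 widening idiom.  This
   hypothetical helper zero-extends the low eight bytes of a u8x16 into
   a u16x8; on little-endian x86 the low byte of each 16-bit lane comes
   from the first operand. */
always_inline u16x8
u8x16_widen_lo_example (u8x16 x)
{
  u8x16 zero = { 0 };
  return (u16x8) u8x16_interleave_lo (x, zero);
}
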
/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

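/* Illustrative sketch, not part of the original header: the pack
   operations narrow with saturation, so out-of-range lanes clamp
   instead of wrapping.  Note that _mm_packus_epi16 reads its inputs as
   signed 16-bit values and clamps them to [0, 255].  Hypothetical
   narrowing of two vectors of 16-bit counters into one u8x16: */
always_inline u8x16
u16x8_clamp_to_u8_example (u16x8 a, u16x8 b)
{
  /* Lanes of a become result bytes 0-7, lanes of b bytes 8-15. */
  return u16x8_pack (a, b);
}
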
/* Splats: replicate scalar value into vector. */
always_inline u64x2
u64x2_splat (u64 a)
{
  u64x2 x = { a, a };
  return x;
}

always_inline u32x4
u32x4_splat (u32 a)
{
  u32x4 x = { a, a, a, a };
  return x;
}

always_inline u16x8
u16x8_splat (u16 a)
{
  u16x8 x = { a, a, a, a, a, a, a, a };
  return x;
}

always_inline u8x16
u8x16_splat (u8 a)
{
  u8x16 x = { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a };
  return x;
}

always_inline u32x2
u32x2_splat (u32 a)
{
  u32x2 x = { a, a };
  return x;
}

always_inline u16x4
u16x4_splat (u16 a)
{
  u16x4 x = { a, a, a, a };
  return x;
}

always_inline u8x8
u8x8_splat (u8 a)
{
  u8x8 x = { a, a, a, a, a, a, a, a };
  return x;
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat

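/* Illustrative sketch, not part of the original header: splats
   broadcast a scalar so lane-wise operations can compare every lane
   against the same value.  Hypothetical helper that sets each result
   byte to 0xff where x matches the given byte: */
always_inline u8x16
u8x16_match_byte_example (u8x16 x, u8 match)
{
  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) u8x16_splat (match));
}
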
#ifndef __ICC
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
#endif

/* Unaligned loads/stores. */

#define _(t) \
  always_inline void t##_store_unaligned (t x, t * a) \
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); } \
  always_inline t t##_load_unaligned (t * a) \
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
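
/* Illustrative sketch, not part of the original header: the unaligned
   variants are safe on buffers with no 16-byte alignment guarantee,
   such as packet payloads.  Hypothetical 16-byte copy at an arbitrary
   offset: */
always_inline void
copy16_unaligned_example (u8 * dst, u8 * src)
{
  u8x16_store_unaligned (u8x16_load_unaligned ((u8x16 *) src),
			 (u8x16 *) dst);
}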

#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
  \
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
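
/* Illustrative sketch, not part of the original header: the saturating
   forms clamp at the type bounds instead of wrapping, which suits
   counters that must not overflow.  (Note the macro pastes the same
   intrinsic stem for both variants, so the i8x16/i16x8 saturate
   versions above also expand to the unsigned epu forms.)  Hypothetical
   saturating increment of sixteen byte-wide counters: */
always_inline u8x16
u8x16_count_sat_example (u8x16 counters)
{
  return u8x16_add_saturate (counters, u8x16_splat (1));
}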

/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* Signed high half: _mm_mulhi_epi16, not the unsigned epu16 variant. */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

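/* Illustrative sketch, not part of the original header: mul_lo and
   mul_hi return the two halves of the full 16x16->32 multiply, so
   interleaving them reconstructs widened products.  Hypothetical full
   products of the low four u16 lanes: */
always_inline u32x4
u16x8_mul_widen_lo_example (u16x8 x, u16x8 y)
{
  u16x8 lo = u16x8_mul_lo (x, y);
  u16x8 hi = u16x8_mul_hi (x, y);
  /* Little-endian: the low 16 bits come first within each 32-bit lane. */
  return (u32x4) u16x8_interleave_lo (lo, hi);
}
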
/* 128 bit shifts. */

#define _(p,a,b,c,f) \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
  \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
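
/* Illustrative sketch, not part of the original header: the ishift
   variants take an immediate count, while the plain shift variants take
   the count from the low 64 bits of a vector operand (the same count
   applies to every lane).  Hypothetical halving of four 32-bit
   counters: */
always_inline u32x4
u32x4_halve_example (u32x4 x)
{
  return u32x4_ishift_right (x, 1);
}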

/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}

#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
	    | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _

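/* Illustrative sketch, not part of the original header: lane-wise
   rotates built this way are the usual ingredient of hash mixing
   rounds.  Hypothetical rotate-left of four 32-bit lanes by 7: */
always_inline u32x4
u32x4_rotl7_example (u32x4 x)
{
  return u32x4_irotate_left (x, 7);
}
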
#ifndef __clang__
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
  { \
    int m = sizeof (t##x##n) / sizeof (t); \
    ASSERT (i >= 0 && i < m); \
    return (t##x##n##_word_shift_##lr1 (w0, i) \
	    | t##x##n##_word_shift_##lr2 (w1, m - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

/* Compare operations. */
always_inline u8x16
u8x16_is_equal (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline i8x16
i8x16_is_equal (i8x16 x, i8x16 y)
{
  return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_is_equal (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_is_equal (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
u32x4_is_equal (u32x4 x, u32x4 y)
{
  return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline i32x4
i32x4_is_equal (i32x4 x, i32x4 y)
{
  return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{
  return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{
  return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
u8x16_is_zero (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_is_equal (x, zero);
}

always_inline u16x8
u16x8_is_zero (u16x8 x)
{
  u16x8 zero = { 0 };
  return u16x8_is_equal (x, zero);
}

always_inline u32x4
u32x4_is_zero (u32x4 x)
{
  u32x4 zero = { 0 };
  return u32x4_is_equal (x, zero);
}

#define u32x4_select(A,MASK) \
({ \
  u32x4 _x, _y; \
  _x = (A); \
  asm volatile ("pshufd %[mask], %[x], %[y]" \
		: /* outputs */ [y] "=x" (_y) \
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y; \
})

#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << (2*0)) \
		      | ((i) << (2*1)) \
		      | ((i) << (2*2)) \
		      | ((i) << (2*3))))

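/* Illustrative sketch, not part of the original header: the pshufd
   immediate packs four 2-bit source-lane indices, so u32x4_splat_word
   repeats one index in all four fields.  Hypothetical broadcast of
   lane 3 (MASK must be a compile-time constant because of the "i"
   constraint): */
always_inline u32x4
u32x4_broadcast_lane3_example (u32x4 x)
{
  return u32x4_splat_word (x, 3);
}
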
/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}

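/* Illustrative sketch, not part of the original header: combining a
   zero-byte mask with a bit scan is the classic vectorized
   find-first-zero-byte step used in string scanning.  Hypothetical
   helper returning the index of the first zero byte, or -1 if none: */
always_inline int
u8x16_first_zero_byte_example (u8x16 x)
{
  u32 m = u8x16_zero_byte_mask (x);
  return m ? (int) __builtin_ctz (m) : -1;
}
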
always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

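/* Illustrative sketch, not part of the original header: the *_scalar
   reductions fold the vector onto itself with word shifts, halving the
   number of live lanes each step, so lane 0 holds the result after
   log2(n) folds.  Hypothetical spread of sixteen byte-wide samples: */
always_inline u32
u8x16_range_example (u8x16 x)
{
  return u8x16_max_scalar (x) - u8x16_min_scalar (x);
}
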
#undef _signed_binop

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */