/* Include-guard macro and <x86intrin.h> include, followed by a 16-byte copy:
 * one unaligned 128-bit load from src into xmm0 and one unaligned 128-bit
 * store of xmm0 to dst.  The enclosing function and the declaration of xmm0
 * are not visible in this excerpt. */
48 #ifndef included_clib_memcpy_avx512_h 49 #define included_clib_memcpy_avx512_h 52 #include <x86intrin.h> 64 xmm0 = _mm_loadu_si128 ((
const __m128i *) src);
65 _mm_storeu_si128 ((__m128i *) dst, xmm0);
/* 32-byte copy: one unaligned 256-bit load from src into ymm0, then one
 * unaligned 256-bit store of ymm0 to dst.  Neither address needs to be
 * aligned for these intrinsics. */
73 ymm0 = _mm256_loadu_si256 ((
const __m256i *) src);
74 _mm256_storeu_si256 ((__m256i *) dst, ymm0);
/* 64-byte copy: one unaligned 512-bit load from src into zmm0, then one
 * unaligned 512-bit store of zmm0 to dst. */
82 zmm0 = _mm512_loadu_si512 ((
const void *) src);
83 _mm512_storeu_si512 ((
void *) dst, zmm0);
/* 128-byte copy done as two 64-byte halves: load src+0 and src+64 into
 * zmm0/zmm1, then store them to dst+0 and dst+64.  Both loads precede both
 * stores in the visible lines.
 * NOTE(review): this looks like the body of a loop, but the loop header and
 * any counter/pointer updates are not visible in this excerpt -- confirm
 * against the full file. */
107 zmm0 = _mm512_loadu_si512 ((
const void *) (src + 0 * 64));
109 zmm1 = _mm512_loadu_si512 ((
const void *) (src + 1 * 64));
111 _mm512_storeu_si512 ((
void *) (dst + 0 * 64), zmm0);
112 _mm512_storeu_si512 ((
void *) (dst + 1 * 64), zmm1);
/* 512-byte copy using eight 512-bit temporaries.
 * NOTE(review): the enclosing function/loop and any updates to src, dst or a
 * remaining-length counter are not visible in this excerpt -- confirm
 * against the full file. */
120 __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
/* Load phase: eight unaligned 64-byte loads at src + 0, 64, ..., 448. */
124 zmm0 = _mm512_loadu_si512 ((
const void *) (src + 0 * 64));
126 zmm1 = _mm512_loadu_si512 ((
const void *) (src + 1 * 64));
127 zmm2 = _mm512_loadu_si512 ((
const void *) (src + 2 * 64));
128 zmm3 = _mm512_loadu_si512 ((
const void *) (src + 3 * 64));
129 zmm4 = _mm512_loadu_si512 ((
const void *) (src + 4 * 64));
130 zmm5 = _mm512_loadu_si512 ((
const void *) (src + 5 * 64));
131 zmm6 = _mm512_loadu_si512 ((
const void *) (src + 6 * 64));
132 zmm7 = _mm512_loadu_si512 ((
const void *) (src + 7 * 64));
/* Store phase: write the eight temporaries to the matching dst offsets. */
134 _mm512_storeu_si512 ((
void *) (dst + 0 * 64), zmm0);
135 _mm512_storeu_si512 ((
void *) (dst + 1 * 64), zmm1);
136 _mm512_storeu_si512 ((
void *) (dst + 2 * 64), zmm2);
137 _mm512_storeu_si512 ((
void *) (dst + 3 * 64), zmm3);
138 _mm512_storeu_si512 ((
void *) (dst + 4 * 64), zmm4);
139 _mm512_storeu_si512 ((
void *) (dst + 5 * 64), zmm5);
140 _mm512_storeu_si512 ((
void *) (dst + 6 * 64), zmm6);
141 _mm512_storeu_si512 ((
void *) (dst + 7 * 64), zmm7);
/* Scalar copies through integer cursors: srcu and dstu hold addresses as
 * uword values and are cast to a pointer of the access width at each step.
 * NOTE(review): the conditions selecting each step are not visible in this
 * excerpt. */
/* Copy one u8, then advance both cursors by one u8 element. */
162 *(
u8 *) dstu = *(
const u8 *) srcu;
163 srcu = (
uword) ((
const u8 *) srcu + 1);
164 dstu = (
uword) ((
u8 *) dstu + 1);
/* Copy one u16, then advance srcu by one u16 element.  A matching dstu
 * advance is not visible in this excerpt. */
168 *(
u16 *) dstu = *(
const u16 *) srcu;
169 srcu = (
uword) ((
const u16 *) srcu + 1);
/* Copy one u32, then advance srcu by one u32 element.  A matching dstu
 * advance is not visible in this excerpt. */
174 *(
u32 *) dstu = *(
const u32 *) srcu;
175 srcu = (
uword) ((
const u32 *) srcu + 1);
/* Copy one u64; no cursor update follows it in the visible lines. */
179 *(
u64 *) dstu = *(
const u64 *) srcu;
/* Pointer bookkeeping.  The copy calls that these advances accompany are
 * not visible in this excerpt. */
/* Step src and dst forward by 256 bytes. */
204 src = (
const u8 *) src + 256;
205 dst = (
u8 *) dst + 256;
/* Step src and dst forward by 128 bytes. */
211 src = (
const u8 *) src + 128;
212 dst = (
u8 *) dst + 128;
/* Jump target for the goto at the end of this excerpt; the statements that
 * follow the label are not shown. */
214 COPY_BLOCK_128_BACK63:
/* dstofss = offset of dst within its 64-byte block (low six address bits). */
229 dstofss = (
uword) dst & 0x3F;
/* Turn the offset into the distance from dst up to the next 64-byte
 * boundary. */
232 dstofss = 64 - dstofss;
/* Advance both pointers by that distance, which leaves dst 64-byte aligned. */
235 src = (
const u8 *) src + dstofss;
236 dst = (
u8 *) dst + dstofss;
/* Advance both pointers by `bits` bytes; how bits is computed is not visible
 * in this excerpt. */
248 src = (
const u8 *) src + bits;
249 dst = (
u8 *) dst + bits;
/* Second advance by `bits` bytes (bits is presumably recomputed in between --
 * confirm against the full file). */
262 src = (
const u8 *) src + bits;
263 dst = (
u8 *) dst + bits;
/* Jump back to the COPY_BLOCK_128_BACK63 label above. */
269 goto COPY_BLOCK_128_BACK63;
/* Signature lines of the static helpers and of clib_memcpy_fast.  Only the
 * signatures appear in this excerpt: no bodies and no terminating
 * semicolons.  The fixed-size helpers take (dst, src); the *blocks helpers
 * and clib_memcpy_fast also take a byte count n.
 * NOTE(review): the numeric suffixes presumably give the number of bytes
 * moved per call or per loop iteration -- confirm against the full
 * definitions. */
static void clib_mov16(u8 *dst, const u8 *src)
static void clib_mov64(u8 *dst, const u8 *src)
static void clib_mov32(u8 *dst, const u8 *src)
static void clib_mov256(u8 *dst, const u8 *src)
static void clib_mov128(u8 *dst, const u8 *src)
static void clib_mov512blocks(u8 *dst, const u8 *src, size_t n)
static void clib_mov128blocks(u8 *dst, const u8 *src, size_t n)
/* clib_memcpy_fast has the same parameter list as standard memcpy and
 * returns void *. */
static void * clib_memcpy_fast(void *dst, const void *src, size_t n)