48 #ifndef included_clib_memcpy_avx512_h 49 #define included_clib_memcpy_avx512_h 52 #include <x86intrin.h> 59 xmm0 = _mm_loadu_si128 ((
const __m128i *) src);
60 _mm_storeu_si128 ((__m128i *) dst, xmm0);
68 ymm0 = _mm256_loadu_si256 ((
const __m256i *) src);
69 _mm256_storeu_si256 ((__m256i *) dst, ymm0);
77 zmm0 = _mm512_loadu_si512 ((
const void *) src);
78 _mm512_storeu_si512 ((
void *) dst, zmm0);
102 zmm0 = _mm512_loadu_si512 ((
const void *) (src + 0 * 64));
104 zmm1 = _mm512_loadu_si512 ((
const void *) (src + 1 * 64));
106 _mm512_storeu_si512 ((
void *) (dst + 0 * 64), zmm0);
107 _mm512_storeu_si512 ((
void *) (dst + 1 * 64), zmm1);
115 __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
119 zmm0 = _mm512_loadu_si512 ((
const void *) (src + 0 * 64));
121 zmm1 = _mm512_loadu_si512 ((
const void *) (src + 1 * 64));
122 zmm2 = _mm512_loadu_si512 ((
const void *) (src + 2 * 64));
123 zmm3 = _mm512_loadu_si512 ((
const void *) (src + 3 * 64));
124 zmm4 = _mm512_loadu_si512 ((
const void *) (src + 4 * 64));
125 zmm5 = _mm512_loadu_si512 ((
const void *) (src + 5 * 64));
126 zmm6 = _mm512_loadu_si512 ((
const void *) (src + 6 * 64));
127 zmm7 = _mm512_loadu_si512 ((
const void *) (src + 7 * 64));
129 _mm512_storeu_si512 ((
void *) (dst + 0 * 64), zmm0);
130 _mm512_storeu_si512 ((
void *) (dst + 1 * 64), zmm1);
131 _mm512_storeu_si512 ((
void *) (dst + 2 * 64), zmm2);
132 _mm512_storeu_si512 ((
void *) (dst + 3 * 64), zmm3);
133 _mm512_storeu_si512 ((
void *) (dst + 4 * 64), zmm4);
134 _mm512_storeu_si512 ((
void *) (dst + 5 * 64), zmm5);
135 _mm512_storeu_si512 ((
void *) (dst + 6 * 64), zmm6);
136 _mm512_storeu_si512 ((
void *) (dst + 7 * 64), zmm7);
157 *(
u8 *) dstu = *(
const u8 *) srcu;
158 srcu = (
uword) ((
const u8 *) srcu + 1);
159 dstu = (
uword) ((
u8 *) dstu + 1);
163 *(
u16 *) dstu = *(
const u16 *) srcu;
164 srcu = (
uword) ((
const u16 *) srcu + 1);
169 *(
u32 *) dstu = *(
const u32 *) srcu;
170 srcu = (
uword) ((
const u32 *) srcu + 1);
174 *(
u64 *) dstu = *(
const u64 *) srcu;
199 src = (
const u8 *) src + 256;
200 dst = (
u8 *) dst + 256;
206 src = (
const u8 *) src + 128;
207 dst = (
u8 *) dst + 128;
209 COPY_BLOCK_128_BACK63:
224 dstofss = (
uword) dst & 0x3F;
227 dstofss = 64 - dstofss;
230 src = (
const u8 *) src + dstofss;
231 dst = (
u8 *) dst + dstofss;
243 src = (
const u8 *) src + bits;
244 dst = (
u8 *) dst + bits;
257 src = (
const u8 *) src + bits;
258 dst = (
u8 *) dst + bits;
264 goto COPY_BLOCK_128_BACK63;
static void clib_mov16(u8 *dst, const u8 *src)
static void clib_mov64(u8 *dst, const u8 *src)
static void clib_mov32(u8 *dst, const u8 *src)
static void clib_mov256(u8 *dst, const u8 *src)
static void clib_mov128(u8 *dst, const u8 *src)
static void clib_mov512blocks(u8 *dst, const u8 *src, size_t n)
static void clib_mov128blocks(u8 *dst, const u8 *src, size_t n)
static void * clib_memcpy_fast(void *dst, const void *src, size_t n)