48 #ifndef included_clib_memcpy_avx_h 49 #define included_clib_memcpy_avx_h 52 #include <x86intrin.h> 59 xmm0 = _mm_loadu_si128((
const __m128i *)src);
60 _mm_storeu_si128((__m128i *)dst, xmm0);
68 ymm0 = _mm256_loadu_si256((
const __m256i *)src);
69 _mm256_storeu_si256((__m256i *)dst, ymm0);
99 ymm0 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 0 * 32));
101 ymm1 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 1 * 32));
102 src = (
const u8 *)src + 64;
103 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 0 * 32), ymm0);
104 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 1 * 32), ymm1);
105 dst = (
u8 *)dst + 64;
112 __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
115 ymm0 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 0 * 32));
117 ymm1 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 1 * 32));
118 ymm2 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 2 * 32));
119 ymm3 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 3 * 32));
120 ymm4 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 4 * 32));
121 ymm5 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 5 * 32));
122 ymm6 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 6 * 32));
123 ymm7 = _mm256_loadu_si256((
const __m256i *)((
const u8 *)src + 7 * 32));
124 src = (
const u8 *)src + 256;
125 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 0 * 32), ymm0);
126 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 1 * 32), ymm1);
127 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 2 * 32), ymm2);
128 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 3 * 32), ymm3);
129 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 4 * 32), ymm4);
130 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 5 * 32), ymm5);
131 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 6 * 32), ymm6);
132 _mm256_storeu_si256((__m256i *)((
u8 *)dst + 7 * 32), ymm7);
133 dst = (
u8 *)dst + 256;
151 *(
u8 *)dstu = *(
const u8 *)srcu;
152 srcu = (
uword)((
const u8 *)srcu + 1);
153 dstu = (
uword)((
u8 *)dstu + 1);
166 *(uint64_t *)dstu = *(
const uint64_t *)srcu;
188 src = (
const u8 *)src + 256;
189 dst = (
u8 *)dst + 256;
194 src = (
const u8 *)src + 128;
195 dst = (
u8 *)dst + 128;
200 src = (
const u8 *)src + 64;
201 dst = (
u8 *)dst + 64;
203 COPY_BLOCK_64_BACK31:
218 dstofss = (
uword)dst & 0x1F;
220 dstofss = 32 - dstofss;
223 src = (
const u8 *)src + dstofss;
224 dst = (
u8 *)dst + dstofss;
236 src = (
const u8 *)src + bits;
237 dst = (
u8 *)dst + bits;
249 src = (
const u8 *)src + bits;
250 dst = (
u8 *)dst + bits;
256 goto COPY_BLOCK_64_BACK31;
static void clib_mov64(u8 *dst, const u8 *src)
static void * clib_memcpy(void *dst, const void *src, size_t n)
unsigned short int uint16_t
static void clib_mov64blocks(u8 *dst, const u8 *src, size_t n)
static void clib_mov16(u8 *dst, const u8 *src)
static void clib_mov256blocks(u8 *dst, const u8 *src, size_t n)
static void clib_mov32(u8 *dst, const u8 *src)
static void clib_mov128(u8 *dst, const u8 *src)
static void clib_mov256(u8 *dst, const u8 *src)