FD.io VPP  v19.08.3-2-gbabecb413
Vector Packet Processing
memcpy_sse3.h File Reference
+ Include dependency graph for memcpy_sse3.h:

Go to the source code of this file.

Macros

#define CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, offset)
 Macro for copying unaligned block from one location to another with constant load offset, 47 bytes leftover maximum, locations should not overlap. More...
 
#define CLIB_MVUNALIGN_LEFT47(dst, src, len, offset)
 Macro for copying unaligned block from one location to another, 47 bytes leftover maximum, locations should not overlap. More...
 

Functions

static void clib_mov16 (u8 *dst, const u8 *src)
 
static void clib_mov32 (u8 *dst, const u8 *src)
 
static void clib_mov64 (u8 *dst, const u8 *src)
 
static void clib_mov128 (u8 *dst, const u8 *src)
 
static void clib_mov256 (u8 *dst, const u8 *src)
 
static void * clib_memcpy_fast (void *dst, const void *src, size_t n)
 

Macro Definition Documentation

◆ CLIB_MVUNALIGN_LEFT47

#define CLIB_MVUNALIGN_LEFT47 (   dst,
  src,
  len,
  offset 
)
Value:
({ \
/* PALIGNR (_mm_alignr_epi8) requires its shift count as a compile-time \
 * immediate, so dispatch on the runtime 'offset' to one of 15 variants \
 * of CLIB_MVUNALIGN_LEFT47_IMM, each instantiated with a constant. \
 * Fix: pass the macro's 'len' parameter through instead of the bare \
 * identifier 'n', which silently captured a variable from the caller's \
 * scope and broke for any caller whose length variable is not named 'n'. */ \
switch (offset) { \
case 0x01: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x01); break; \
case 0x02: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x02); break; \
case 0x03: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x03); break; \
case 0x04: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x04); break; \
case 0x05: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x05); break; \
case 0x06: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x06); break; \
case 0x07: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x07); break; \
case 0x08: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x08); break; \
case 0x09: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x09); break; \
case 0x0A: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0A); break; \
case 0x0B: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0B); break; \
case 0x0C: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0C); break; \
case 0x0D: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0D); break; \
case 0x0E: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0E); break; \
case 0x0F: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0F); break; \
default:; /* offset 0 means the load is already aligned: nothing to do */ \
} \
})
vl_api_address_t src
Definition: gre.api:51
#define CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, offset)
Macro for copying unaligned block from one location to another with constant load offset...
Definition: memcpy_sse3.h:107
vl_api_address_t dst
Definition: gre.api:52
template key/value backing page structure
Definition: bihash_doc.h:44

Macro for copying unaligned block from one location to another, 47 bytes leftover maximum, locations should not overlap.

Use a switch here because the aligning instruction requires an immediate value for its shift count. Requirements:

  • Store is aligned
  • Load offset is <offset>, which must be within [1, 15]
  • For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
  • <dst>, <src>, <len> must be variables
  • __m128i <xmm0> ~ <xmm8> used in CLIB_MVUNALIGN_LEFT47_IMM must be pre-defined

Definition at line 168 of file memcpy_sse3.h.

◆ CLIB_MVUNALIGN_LEFT47_IMM

#define CLIB_MVUNALIGN_LEFT47_IMM (   dst,
  src,
  len,
  offset 
)
Value:
/* Copy 'len' bytes from an unaligned 'src' to 'dst' using SSSE3 \
 * _mm_alignr_epi8 with the compile-time immediate 'offset' (1..15). \
 * Loads start 'offset' bytes BEFORE src, so the caller must guarantee \
 * those bytes are readable (see the documented requirements). \
 * Leaves up to 47 bytes uncopied in 'len'; src/dst/len are advanced. \
 * NOTE(review): xmm0..xmm8 must be pre-declared __m128i in the caller. */ \
({ \
int tmp; /* scratch: byte count by which src/dst advance after each loop */ \
/* Main loop: 128 bytes per iteration. Nine overlapping 16-byte loads \
 * (xmm0..xmm8) are realigned pairwise into eight aligned 16-byte stores. */ \
while (len >= 128 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
len -= 128; \
xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
xmm3 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 3 * 16)); \
xmm4 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 4 * 16)); \
xmm5 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 5 * 16)); \
xmm6 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 6 * 16)); \
xmm7 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 7 * 16)); \
xmm8 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 8 * 16)); \
src = (const u8 *)src + 128; \
_mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
dst = (u8 *)dst + 128; \
} \
/* Fold len into the sub-128 residue (preserving the 16 - offset head \
 * margin needed by the loads) and advance src/dst by the difference. */ \
tmp = len; \
len = ((len - 16 + offset) & 127) + 16 - offset; \
tmp -= len; \
src = (const u8 *)src + tmp; \
dst = (u8 *)dst + tmp; \
/* Secondary loop: same scheme at 32 bytes per iteration, three loads \
 * realigned into two aligned stores. */ \
if (len >= 32 + 16 - offset) { \
while (len >= 32 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
len -= 32; \
xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
src = (const u8 *)src + 32; \
_mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
dst = (u8 *)dst + 32; \
} \
/* Fold len into the sub-32 residue and advance src/dst likewise. */ \
tmp = len; \
len = ((len - 16 + offset) & 31) + 16 - offset; \
tmp -= len; \
src = (const u8 *)src + tmp; \
dst = (u8 *)dst + tmp; \
} \
})
vl_api_address_t src
Definition: gre.api:51
unsigned char u8
Definition: types.h:56
vl_api_address_t dst
Definition: gre.api:52
u8 len
Definition: ip_types.api:90
template key/value backing page structure
Definition: bihash_doc.h:44
struct clib_bihash_value offset
template key/value backing page structure

Macro for copying unaligned block from one location to another with constant load offset, 47 bytes leftover maximum, locations should not overlap.

Requirements:

  • Store is aligned
  • Load offset is <offset>, which must be an immediate value within [1, 15]
  • For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
  • <dst>, <src>, <len> must be variables
  • __m128i <xmm0> ~ <xmm8> must be pre-defined

Definition at line 107 of file memcpy_sse3.h.

Function Documentation

◆ clib_memcpy_fast()

static void* clib_memcpy_fast ( void *  dst,
const void *  src,
size_t  n 
)
inlinestatic

Copy less than 16 bytes

Fast way when copy size doesn't exceed 512 bytes

Make store aligned when copy size exceeds 512 bytes, and make sure the first 15 bytes are copied, because unaligned copy functions require up to 15 bytes backwards access.

For aligned copy

Copy 256-byte blocks

Copy whatever left

For copy with unaligned load

Copy whatever left

Definition at line 191 of file memcpy_sse3.h.

+ Here is the call graph for this function:

◆ clib_mov128()

static void clib_mov128 ( u8 dst,
const u8 src 
)
inlinestatic

Definition at line 83 of file memcpy_sse3.h.

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ clib_mov16()

static void clib_mov16 ( u8 dst,
const u8 src 
)
inlinestatic

Definition at line 60 of file memcpy_sse3.h.

+ Here is the caller graph for this function:

◆ clib_mov256()

static void clib_mov256 ( u8 dst,
const u8 src 
)
inlinestatic

Definition at line 90 of file memcpy_sse3.h.

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ clib_mov32()

static void clib_mov32 ( u8 dst,
const u8 src 
)
inlinestatic

Definition at line 69 of file memcpy_sse3.h.

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ clib_mov64()

static void clib_mov64 ( u8 dst,
const u8 src 
)
inlinestatic

Definition at line 76 of file memcpy_sse3.h.

+ Here is the call graph for this function:
+ Here is the caller graph for this function: