FD.io VPP v21.06-3-gbb25fbf28 (Vector Packet Processing)
src/vppinfra/memcpy_sse3.h
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*-
 * BSD LICENSE
 *
 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef included_clib_memcpy_sse3_h
#define included_clib_memcpy_sse3_h

#include <stdint.h>
#include <x86intrin.h>
#include <vppinfra/warnings.h>

/* *INDENT-OFF* */
WARN_OFF (stringop-overflow)
/* *INDENT-ON* */

static inline void
clib_mov16 (u8 * dst, const u8 * src)
{
  __m128i xmm0;

  xmm0 = _mm_loadu_si128 ((const __m128i *) src);
  _mm_storeu_si128 ((__m128i *) dst, xmm0);
}

static inline void
clib_mov32 (u8 * dst, const u8 * src)
{
  clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16);
  clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16);
}

static inline void
clib_mov64 (u8 * dst, const u8 * src)
{
  clib_mov32 ((u8 *) dst + 0 * 32, (const u8 *) src + 0 * 32);
  clib_mov32 ((u8 *) dst + 1 * 32, (const u8 *) src + 1 * 32);
}

static inline void
clib_mov128 (u8 * dst, const u8 * src)
{
  clib_mov64 ((u8 *) dst + 0 * 64, (const u8 *) src + 0 * 64);
  clib_mov64 ((u8 *) dst + 1 * 64, (const u8 *) src + 1 * 64);
}

static inline void
clib_mov256 (u8 * dst, const u8 * src)
{
  clib_mov128 ((u8 *) dst + 0 * 128, (const u8 *) src + 0 * 128);
  clib_mov128 ((u8 *) dst + 1 * 128, (const u8 *) src + 1 * 128);
}
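
/*
 * Illustrative sketch (not part of the original header): each clib_movN
 * helper above copies exactly N bytes using unaligned SSE loads and stores,
 * and the larger sizes are built by doubling the smaller ones.  Assuming
 * <string.h> and <assert.h> are available, a minimal self-check for one of
 * the helpers could look like:
 *
 *   static void
 *   check_clib_mov64 (void)
 *   {
 *     u8 src[64], dst[64];
 *     for (int i = 0; i < 64; i++)
 *       src[i] = (u8) i;
 *     clib_mov64 (dst, src);               // fixed-size 64-byte SSE copy
 *     assert (memcmp (dst, src, 64) == 0); // matches a plain byte copy
 *   }
 */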

/**
 * Macro for copying an unaligned block from one location to another with a
 * constant load offset, leaving at most 47 bytes over after the loop;
 * the locations must not overlap.
 * Requirements:
 * - The store is aligned.
 * - The load offset is <offset>, which must be an immediate value within [1, 15].
 * - For <src>, make sure <offset> bytes backwards and <16 - offset> bytes
 *   forwards are available for loading.
 * - <dst>, <src>, <len> must be variables.
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined.
 */
#define CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, offset)                  \
({                                                                        \
    int tmp;                                                              \
    while (len >= 128 + 16 - offset) {                                    \
        xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
        len -= 128;                                                       \
        xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 8 * 16)); \
        src = (const u8 *)src + 128;                                      \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (u8 *)dst + 128;                                            \
    }                                                                     \
    tmp = len;                                                            \
    len = ((len - 16 + offset) & 127) + 16 - offset;                      \
    tmp -= len;                                                           \
    src = (const u8 *)src + tmp;                                          \
    dst = (u8 *)dst + tmp;                                                \
    if (len >= 32 + 16 - offset) {                                        \
        while (len >= 32 + 16 - offset) {                                 \
            xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
            len -= 32;                                                    \
            xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
            src = (const u8 *)src + 32;                                   \
            _mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (u8 *)dst + 32;                                         \
        }                                                                 \
        tmp = len;                                                        \
        len = ((len - 16 + offset) & 31) + 16 - offset;                   \
        tmp -= len;                                                       \
        src = (const u8 *)src + tmp;                                      \
        dst = (u8 *)dst + tmp;                                            \
    }                                                                     \
})
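
/*
 * Illustrative sketch (not part of the original header): the loop above
 * replaces unaligned loads with two consecutive 16-byte loads taken from
 * (src - offset), which are then stitched back together with
 * _mm_alignr_epi8, whose shift count must be a compile-time immediate.
 * Assuming an offset of 3, the core step is equivalent to:
 *
 *   static void
 *   alignr_demo (u8 * dst, const u8 * src)   // copies src[0..15] to dst
 *   {
 *     const int off = 3;                      // must match the immediate below
 *     __m128i lo = _mm_loadu_si128 ((const __m128i *) (src - off));
 *     __m128i hi = _mm_loadu_si128 ((const __m128i *) (src - off + 16));
 *     // bytes [3..18] of the 32-byte concatenation (lo | hi) are src[0..15]
 *     _mm_storeu_si128 ((__m128i *) dst, _mm_alignr_epi8 (hi, lo, 3));
 *   }
 *
 * This requires that the 3 bytes before src and 13 bytes past the copied
 * range are readable, which is exactly the "<offset> bytes backwards and
 * <16 - offset> bytes forwards" requirement documented above.
 */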

/**
 * Macro for copying an unaligned block from one location to another,
 * leaving at most 47 bytes over after the loop;
 * the locations must not overlap.
 * A switch is used here because the aligning instruction requires an
 * immediate value for the shift count.
 * Requirements:
 * - The store is aligned.
 * - The load offset is <offset>, which must be within [1, 15].
 * - For <src>, make sure <offset> bytes backwards and <16 - offset> bytes
 *   forwards are available for loading.
 * - <dst>, <src>, <len> must be variables.
 * - __m128i <xmm0> ~ <xmm8> used in CLIB_MVUNALIGN_LEFT47_IMM must be
 *   pre-defined.
 */
#define CLIB_MVUNALIGN_LEFT47(dst, src, len, offset)                      \
({                                                                        \
    switch (offset) {                                                     \
    case 0x01: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x01); break;       \
    case 0x02: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x02); break;       \
    case 0x03: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x03); break;       \
    case 0x04: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x04); break;       \
    case 0x05: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x05); break;       \
    case 0x06: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x06); break;       \
    case 0x07: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x07); break;       \
    case 0x08: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x08); break;       \
    case 0x09: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x09); break;       \
    case 0x0A: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0A); break;       \
    case 0x0B: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0B); break;       \
    case 0x0C: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0C); break;       \
    case 0x0D: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0D); break;       \
    case 0x0E: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0E); break;       \
    case 0x0F: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0F); break;       \
    default:;                                                             \
    }                                                                     \
})
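
/*
 * Illustrative sketch (not part of the original header): the switch above
 * exists only because _mm_alignr_epi8 requires its shift count as an
 * immediate, so a runtime offset has to be dispatched to one of fifteen
 * separately expanded bodies.  The same pattern in miniature, using a
 * hypothetical helper name:
 *
 *   static __m128i
 *   alignr_var (__m128i hi, __m128i lo, int offset)  // offset in [1, 15]
 *   {
 *     switch (offset)
 *       {
 *       case 1:  return _mm_alignr_epi8 (hi, lo, 1);
 *       case 2:  return _mm_alignr_epi8 (hi, lo, 2);
 *       case 3:  return _mm_alignr_epi8 (hi, lo, 3);
 *       case 4:  return _mm_alignr_epi8 (hi, lo, 4);
 *       case 5:  return _mm_alignr_epi8 (hi, lo, 5);
 *       case 6:  return _mm_alignr_epi8 (hi, lo, 6);
 *       case 7:  return _mm_alignr_epi8 (hi, lo, 7);
 *       case 8:  return _mm_alignr_epi8 (hi, lo, 8);
 *       case 9:  return _mm_alignr_epi8 (hi, lo, 9);
 *       case 10: return _mm_alignr_epi8 (hi, lo, 10);
 *       case 11: return _mm_alignr_epi8 (hi, lo, 11);
 *       case 12: return _mm_alignr_epi8 (hi, lo, 12);
 *       case 13: return _mm_alignr_epi8 (hi, lo, 13);
 *       case 14: return _mm_alignr_epi8 (hi, lo, 14);
 *       case 15: return _mm_alignr_epi8 (hi, lo, 15);
 *       default: return lo;
 *       }
 *   }
 */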

static inline void *
clib_memcpy_fast_sse3 (void *dst, const void *src, size_t n)
{
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
  uword dstu = (uword) dst;
  uword srcu = (uword) src;
  void *ret = dst;
  size_t dstofss;
  size_t srcofs;

  /**
   * Copy less than 16 bytes
   */
  if (n < 16)
    {
      if (n & 0x01)
	{
	  *(u8 *) dstu = *(const u8 *) srcu;
	  srcu = (uword) ((const u8 *) srcu + 1);
	  dstu = (uword) ((u8 *) dstu + 1);
	}
      if (n & 0x02)
	{
	  *(u16 *) dstu = *(const u16 *) srcu;
	  srcu = (uword) ((const u16 *) srcu + 1);
	  dstu = (uword) ((u16 *) dstu + 1);
	}
      if (n & 0x04)
	{
	  *(u32 *) dstu = *(const u32 *) srcu;
	  srcu = (uword) ((const u32 *) srcu + 1);
	  dstu = (uword) ((u32 *) dstu + 1);
	}
      if (n & 0x08)
	{
	  *(u64 *) dstu = *(const u64 *) srcu;
	}
      return ret;
    }

  /**
   * Fast path when the copy size does not exceed 512 bytes
   */
  if (n <= 32)
    {
      clib_mov16 ((u8 *) dst, (const u8 *) src);
      clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
      return ret;
    }
  if (n <= 48)
    {
      clib_mov32 ((u8 *) dst, (const u8 *) src);
      clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
      return ret;
    }
  if (n <= 64)
    {
      clib_mov32 ((u8 *) dst, (const u8 *) src);
      clib_mov16 ((u8 *) dst + 32, (const u8 *) src + 32);
      clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
      return ret;
    }
  if (n <= 128)
    {
      goto COPY_BLOCK_128_BACK15;
    }
  if (n <= 512)
    {
      if (n >= 256)
	{
	  n -= 256;
	  clib_mov128 ((u8 *) dst, (const u8 *) src);
	  clib_mov128 ((u8 *) dst + 128, (const u8 *) src + 128);
	  src = (const u8 *) src + 256;
	  dst = (u8 *) dst + 256;
	}
    COPY_BLOCK_255_BACK15:
      if (n >= 128)
	{
	  n -= 128;
	  clib_mov128 ((u8 *) dst, (const u8 *) src);
	  src = (const u8 *) src + 128;
	  dst = (u8 *) dst + 128;
	}
    COPY_BLOCK_128_BACK15:
      if (n >= 64)
	{
	  n -= 64;
	  clib_mov64 ((u8 *) dst, (const u8 *) src);
	  src = (const u8 *) src + 64;
	  dst = (u8 *) dst + 64;
	}
    COPY_BLOCK_64_BACK15:
      if (n >= 32)
	{
	  n -= 32;
	  clib_mov32 ((u8 *) dst, (const u8 *) src);
	  src = (const u8 *) src + 32;
	  dst = (u8 *) dst + 32;
	}
      if (n > 16)
	{
	  clib_mov16 ((u8 *) dst, (const u8 *) src);
	  clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
	  return ret;
	}
      if (n > 0)
	{
	  clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
	}
      return ret;
    }

  /**
   * Make the store aligned when the copy size exceeds 512 bytes,
   * and make sure the first 15 bytes are copied, because the
   * unaligned copy macros require up to 15 bytes of backwards access.
   */
  dstofss = (uword) dst & 0x0F;
  if (dstofss > 0)
    {
      dstofss = 16 - dstofss + 16;
      n -= dstofss;
      clib_mov32 ((u8 *) dst, (const u8 *) src);
      src = (const u8 *) src + dstofss;
      dst = (u8 *) dst + dstofss;
    }
  srcofs = ((uword) src & 0x0F);

  /**
   * For aligned copy
   */
  if (srcofs == 0)
    {
      /**
       * Copy 256-byte blocks
       */
      for (; n >= 256; n -= 256)
	{
	  clib_mov256 ((u8 *) dst, (const u8 *) src);
	  dst = (u8 *) dst + 256;
	  src = (const u8 *) src + 256;
	}

      /**
       * Copy whatever is left
       */
      goto COPY_BLOCK_255_BACK15;
    }

  /**
   * For copy with unaligned load
   */
  CLIB_MVUNALIGN_LEFT47 (dst, src, n, srcofs);

  /**
   * Copy whatever is left
   */
  goto COPY_BLOCK_64_BACK15;
}
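
/*
 * Usage sketch (not part of the original header): clib_memcpy_fast_sse3 has
 * the same contract as memcpy for non-overlapping buffers and returns dst.
 * Assuming <string.h> and <assert.h> are available, a caller might exercise
 * it as follows; within VPP, callers would normally go through the generic
 * clib_memcpy_fast wrapper, which is expected to select this variant on
 * SSSE3-capable builds.
 *
 *   static void
 *   copy_demo (void)
 *   {
 *     u8 in[1500], out[1500];
 *     for (int i = 0; i < 1500; i++)
 *       in[i] = (u8) i;
 *     void *r = clib_memcpy_fast_sse3 (out, in, sizeof (in));
 *     assert (r == out);                          // returns dst, like memcpy
 *     assert (memcmp (out, in, sizeof (in)) == 0);
 *   }
 */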

/* *INDENT-OFF* */
WARN_ON (stringop-overflow)
/* *INDENT-ON* */

#undef CLIB_MVUNALIGN_LEFT47_IMM
#undef CLIB_MVUNALIGN_LEFT47

#endif /* included_clib_memcpy_sse3_h */


/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */