FD.io VPP  v20.01-48-g3e0dafb74
Vector Packet Processing
memcpy_avx2.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*-
16  * BSD LICENSE
17  *
18  * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
19  * All rights reserved.
20  *
21  * Redistribution and use in source and binary forms, with or without
22  * modification, are permitted provided that the following conditions
23  * are met:
24  *
25  * * Redistributions of source code must retain the above copyright
26  * notice, this list of conditions and the following disclaimer.
27  * * Redistributions in binary form must reproduce the above copyright
28  * notice, this list of conditions and the following disclaimer in
29  * the documentation and/or other materials provided with the
30  * distribution.
31  * * Neither the name of Intel Corporation nor the names of its
32  * contributors may be used to endorse or promote products derived
33  * from this software without specific prior written permission.
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
39  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
45  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46  */
47 
48 #ifndef included_clib_memcpy_avx2_h
49 #define included_clib_memcpy_avx2_h
50 
51 #include <stdint.h>
52 #include <x86intrin.h>
53 
54 static inline void
55 clib_mov16 (u8 * dst, const u8 * src)
56 {
57  __m128i xmm0;
58 
59  xmm0 = _mm_loadu_si128 ((const __m128i *) src);
60  _mm_storeu_si128 ((__m128i *) dst, xmm0);
61 }
62 
63 static inline void
64 clib_mov32 (u8 * dst, const u8 * src)
65 {
66  __m256i ymm0;
67 
68  ymm0 = _mm256_loadu_si256 ((const __m256i *) src);
69  _mm256_storeu_si256 ((__m256i *) dst, ymm0);
70 }
71 
72 static inline void
73 clib_mov64 (u8 * dst, const u8 * src)
74 {
75  clib_mov32 ((u8 *) dst + 0 * 32, (const u8 *) src + 0 * 32);
76  clib_mov32 ((u8 *) dst + 1 * 32, (const u8 *) src + 1 * 32);
77 }
78 
79 static inline void
80 clib_mov128 (u8 * dst, const u8 * src)
81 {
82  clib_mov64 ((u8 *) dst + 0 * 64, (const u8 *) src + 0 * 64);
83  clib_mov64 ((u8 *) dst + 1 * 64, (const u8 *) src + 1 * 64);
84 }
85 
86 static inline void
87 clib_mov128blocks (u8 * dst, const u8 * src, size_t n)
88 {
89  __m256i ymm0, ymm1, ymm2, ymm3;
90 
91  while (n >= 128)
92  {
93  ymm0 =
94  _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 0 * 32));
95  n -= 128;
96  ymm1 =
97  _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 1 * 32));
98  ymm2 =
99  _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 2 * 32));
100  ymm3 =
101  _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 3 * 32));
102  src = (const u8 *) src + 128;
103  _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 0 * 32), ymm0);
104  _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 1 * 32), ymm1);
105  _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 2 * 32), ymm2);
106  _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 3 * 32), ymm3);
107  dst = (u8 *) dst + 128;
108  }
109 }
110 
111 static inline void *
112 clib_memcpy_fast (void *dst, const void *src, size_t n)
113 {
114  uword dstu = (uword) dst;
115  uword srcu = (uword) src;
116  void *ret = dst;
117  size_t dstofss;
118  size_t bits;
119 
120  /**
121  * Copy less than 16 bytes
122  */
123  if (n < 16)
124  {
125  if (n & 0x01)
126  {
127  *(u8 *) dstu = *(const u8 *) srcu;
128  srcu = (uword) ((const u8 *) srcu + 1);
129  dstu = (uword) ((u8 *) dstu + 1);
130  }
131  if (n & 0x02)
132  {
133  *(u16 *) dstu = *(const u16 *) srcu;
134  srcu = (uword) ((const u16 *) srcu + 1);
135  dstu = (uword) ((u16 *) dstu + 1);
136  }
137  if (n & 0x04)
138  {
139  *(u32 *) dstu = *(const u32 *) srcu;
140  srcu = (uword) ((const u32 *) srcu + 1);
141  dstu = (uword) ((u32 *) dstu + 1);
142  }
143  if (n & 0x08)
144  {
145  *(u64 *) dstu = *(const u64 *) srcu;
146  }
147  return ret;
148  }
149 
150  /**
151  * Fast way when copy size doesn't exceed 512 bytes
152  */
153  if (n <= 32)
154  {
155  clib_mov16 ((u8 *) dst, (const u8 *) src);
156  clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
157  return ret;
158  }
159  if (n <= 48)
160  {
161  clib_mov16 ((u8 *) dst, (const u8 *) src);
162  clib_mov16 ((u8 *) dst + 16, (const u8 *) src + 16);
163  clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
164  return ret;
165  }
166  if (n <= 64)
167  {
168  clib_mov32 ((u8 *) dst, (const u8 *) src);
169  clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
170  return ret;
171  }
172  if (n <= 256)
173  {
174  if (n >= 128)
175  {
176  n -= 128;
177  clib_mov128 ((u8 *) dst, (const u8 *) src);
178  src = (const u8 *) src + 128;
179  dst = (u8 *) dst + 128;
180  }
181  COPY_BLOCK_128_BACK31:
182  if (n >= 64)
183  {
184  n -= 64;
185  clib_mov64 ((u8 *) dst, (const u8 *) src);
186  src = (const u8 *) src + 64;
187  dst = (u8 *) dst + 64;
188  }
189  if (n > 32)
190  {
191  clib_mov32 ((u8 *) dst, (const u8 *) src);
192  clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
193  return ret;
194  }
195  if (n > 0)
196  {
197  clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
198  }
199  return ret;
200  }
201 
202  /**
203  * Make store aligned when copy size exceeds 256 bytes
204  */
205  dstofss = (uword) dst & 0x1F;
206  if (dstofss > 0)
207  {
208  dstofss = 32 - dstofss;
209  n -= dstofss;
210  clib_mov32 ((u8 *) dst, (const u8 *) src);
211  src = (const u8 *) src + dstofss;
212  dst = (u8 *) dst + dstofss;
213  }
214 
215  /**
216  * Copy 128-byte blocks.
217  */
218  clib_mov128blocks ((u8 *) dst, (const u8 *) src, n);
219  bits = n;
220  n = n & 127;
221  bits -= n;
222  src = (const u8 *) src + bits;
223  dst = (u8 *) dst + bits;
224 
225  /**
226  * Copy whatever left
227  */
228  goto COPY_BLOCK_128_BACK31;
229 }
230 
231 
232 #endif /* included_clib_memcpy_avx2_h */
233 
234 
235 /*
236  * fd.io coding-style-patch-verification: ON
237  *
238  * Local Variables:
239  * eval: (c-set-style "gnu")
240  * End:
241  */
unsigned long u64
Definition: types.h:89
vl_api_address_t src
Definition: gre.api:60
unsigned char u8
Definition: types.h:56
unsigned int u32
Definition: types.h:88
unsigned short u16
Definition: types.h:57
static void clib_mov32(u8 *dst, const u8 *src)
Definition: memcpy_avx2.h:64
vl_api_address_t dst
Definition: gre.api:61
static void clib_mov16(u8 *dst, const u8 *src)
Definition: memcpy_avx2.h:55
static void clib_mov128(u8 *dst, const u8 *src)
Definition: memcpy_avx2.h:80
u64 uword
Definition: types.h:112
static void * clib_memcpy_fast(void *dst, const void *src, size_t n)
Definition: memcpy_avx2.h:112
static void clib_mov128blocks(u8 *dst, const u8 *src, size_t n)
Definition: memcpy_avx2.h:87
static void clib_mov64(u8 *dst, const u8 *src)
Definition: memcpy_avx2.h:73