FD.io VPP  v20.09-64-g4f7b92f0a
Vector Packet Processing
ghash.h
Go to the documentation of this file.
1 /*
2  *------------------------------------------------------------------
3  * Copyright (c) 2019 Cisco and/or its affiliates.
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *------------------------------------------------------------------
16  */
17 
18 /*
19  *------------------------------------------------------------------
20  * Copyright(c) 2018, Intel Corporation All rights reserved.
21  *
22  * Redistribution and use in source and binary forms, with or without
23  * modification, are permitted provided that the following conditions
24  * are met:
25  * * Redistributions of source code must retain the above copyright
26  * notice, this list of conditions and the following disclaimer.
27  * * Redistributions in binary form must reproduce the above copyright
28  * notice, this list of conditions and the following disclaimer in
29  * the documentation and/or other materials provided with the
30  * distribution.
31  * * Neither the name of Intel Corporation nor the names of its
32  * contributors may be used to endorse or promote products derived
33  * from this software without specific prior written permission.
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
36  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
38  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
39  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES * LOSS OF USE,
42  * DATA, OR PROFITS * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
45  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46  *------------------------------------------------------------------
47  */
48 
49 /*
50  * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
51  * Vinodh Gopal, James Guilford, Tomasz Kantecki
52  *
53  * References:
54  * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
55  * Intel Architecture Processors. August, 2010
56  * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
57  * Intel Architecture Processors. October, 2012.
58  * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
59  *
60  * Definitions:
61  * GF Galois Extension Field GF(2^128) - finite field where elements are
62  * represented as polynomials with coefficients in GF(2) with the
63  * highest degree of 127. Polynomials are represented as 128-bit binary
64  * numbers where each bit represents one coefficient.
65  * e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
66  * H hash key (128 bit)
67  * POLY irreducible polynomial x^127 + x^7 + x^2 + x + 1
68  * RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
69  * + addition in GF, which equals to XOR operation
70  * * multiplication in GF
71  *
72  * GF multiplication consists of 2 steps:
73  * - carry-less multiplication of two 128-bit operands into 256-bit result
74  * - reduction of 256-bit result into 128-bit with modulo POLY
75  *
76  * GHash is calculated on 128-bit blocks of data according to the following
77  * formula:
78  * GH = (GH + data) * hash_key
79  *
80  * To avoid bit-reflection of data, this code uses GF multipication
81  * with reversed polynomial:
82  * a * b * x^-127 mod RPOLY
83  *
84  * To improve computation speed table Hi is precomputed with powers of H',
85  * where H' is calculated as H<<1 mod RPOLY.
86  * This allows us to improve performance by deferring reduction. For example
87  * to caclulate ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
88  *
89  * __i128 Hi[4];
90  * ghash_precompute (H, Hi, 4);
91  *
92  * ghash_data_t _gd, *gd = &_gd;
93  * ghash_mul_first (gd, GH ^ b0, Hi[3]);
94  * ghash_mul_next (gd, b1, Hi[2]);
95  * ghash_mul_next (gd, b2, Hi[1]);
96  * ghash_mul_next (gd, b3, Hi[0]);
97  * ghash_reduce (gd);
98  * ghash_reduce2 (gd);
99  * GH = ghash_final (gd);
100  *
101  * Reduction step is split into 3 functions so it can be better interleaved
102  * with other code, (i.e. with AES computation).
103  */
104 
105 #ifndef __ghash_h__
106 #define __ghash_h__
107 
109 gmul_lo_lo (u8x16 a, u8x16 b)
110 {
111 #if defined (__PCLMUL__)
112  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
113 #elif defined (__ARM_FEATURE_CRYPTO)
114  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
115  (poly64_t) vget_low_p64 ((poly64x2_t) b));
116 #endif
117 }
118 
120 gmul_hi_lo (u8x16 a, u8x16 b)
121 {
122 #if defined (__PCLMUL__)
123  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
124 #elif defined (__ARM_FEATURE_CRYPTO)
125  return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
126  (poly64_t) vget_low_p64 ((poly64x2_t) b));
127 #endif
128 }
129 
131 gmul_lo_hi (u8x16 a, u8x16 b)
132 {
133 #if defined (__PCLMUL__)
134  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
135 #elif defined (__ARM_FEATURE_CRYPTO)
136  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
137  (poly64_t) vget_high_p64 ((poly64x2_t) b));
138 #endif
139 }
140 
142 gmul_hi_hi (u8x16 a, u8x16 b)
143 {
144 #if defined (__PCLMUL__)
145  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
146 #elif defined (__ARM_FEATURE_CRYPTO)
147  return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
148 #endif
149 }
150 
151 typedef struct
152 {
153  u8x16 mid, hi, lo, tmp_lo, tmp_hi;
154  int pending;
155 } ghash_data_t;
156 
157 static const u8x16 ghash_poly = {
158  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
159  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
160 };
161 
162 static const u8x16 ghash_poly2 = {
163  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
164  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
165 };
166 
168 ghash_mul_first (ghash_data_t * gd, u8x16 a, u8x16 b)
169 {
170  /* a1 * b1 */
171  gd->hi = gmul_hi_hi (a, b);
172  /* a0 * b0 */
173  gd->lo = gmul_lo_lo (a, b);
174  /* a0 * b1 ^ a1 * b0 */
175  gd->mid = (gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b));
176 
177  /* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows that
178  there is no pending data in tmp_lo and tmp_hi */
179  gd->pending = 0;
180 }
181 
183 ghash_mul_next (ghash_data_t * gd, u8x16 a, u8x16 b)
184 {
185  /* a1 * b1 */
186  u8x16 hi = gmul_hi_hi (a, b);
187  /* a0 * b0 */
188  u8x16 lo = gmul_lo_lo (a, b);
189 
190  /* this branch will be optimized out by the compiler, and it allows us to
191  reduce number of XOR operations by using ternary logic */
192  if (gd->pending)
193  {
194  /* there is peding data from previous invocation so we can XOR */
195  gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
196  gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
197  gd->pending = 0;
198  }
199  else
200  {
201  /* there is no peding data from previous invocation so we postpone XOR */
202  gd->tmp_hi = hi;
203  gd->tmp_lo = lo;
204  gd->pending = 1;
205  }
206 
207  /* gd->mid ^= a0 * b1 ^ a1 * b0 */
208  gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
209 }
210 
213 {
214  u8x16 r;
215 
216  /* Final combination:
217  gd->lo ^= gd->mid << 64
218  gd->hi ^= gd->mid >> 64 */
219  u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
220  u8x16 midr = u8x16_word_shift_right (gd->mid, 8);
221 
222  if (gd->pending)
223  {
224  gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
225  gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
226  }
227  else
228  {
229  gd->lo ^= midl;
230  gd->hi ^= midr;
231  }
232  r = gmul_hi_lo (ghash_poly2, gd->lo);
233  gd->lo ^= u8x16_word_shift_left (r, 8);
234 }
235 
238 {
239  gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
240  gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
241 }
242 
245 {
246  return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
247  u8x16_word_shift_left (gd->tmp_hi, 4));
248 }
249 
251 ghash_mul (u8x16 a, u8x16 b)
252 {
253  ghash_data_t _gd, *gd = &_gd;
254  ghash_mul_first (gd, a, b);
255  ghash_reduce (gd);
256  ghash_reduce2 (gd);
257  return ghash_final (gd);
258 }
259 
260 #ifdef __VPCLMULQDQ__
261 
262 static const u8x64 ghash4_poly2 = {
263  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
264  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
265  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
266  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
267  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
268  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
269  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
270  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
271 };
272 
273 typedef struct
274 {
275  u8x64 hi, lo, mid, tmp_lo, tmp_hi;
276  int pending;
277 } ghash4_data_t;
278 
280 gmul4_lo_lo (u8x64 a, u8x64 b)
281 {
282  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
283 }
284 
286 gmul4_hi_lo (u8x64 a, u8x64 b)
287 {
288  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
289 }
290 
292 gmul4_lo_hi (u8x64 a, u8x64 b)
293 {
294  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
295 }
296 
298 gmul4_hi_hi (u8x64 a, u8x64 b)
299 {
300  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
301 }
302 
303 
305 ghash4_mul_first (ghash4_data_t * gd, u8x64 a, u8x64 b)
306 {
307  gd->hi = gmul4_hi_hi (a, b);
308  gd->lo = gmul4_lo_lo (a, b);
309  gd->mid = (gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b));
310  gd->pending = 0;
311 }
312 
314 ghash4_mul_next (ghash4_data_t * gd, u8x64 a, u8x64 b)
315 {
316  u8x64 hi = gmul4_hi_hi (a, b);
317  u8x64 lo = gmul4_lo_lo (a, b);
318 
319  if (gd->pending)
320  {
321  /* there is peding data from previous invocation so we can XOR */
322  gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, hi);
323  gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, lo);
324  gd->pending = 0;
325  }
326  else
327  {
328  /* there is no peding data from previous invocation so we postpone XOR */
329  gd->tmp_hi = hi;
330  gd->tmp_lo = lo;
331  gd->pending = 1;
332  }
333  gd->mid = u8x64_xor3 (gd->mid, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
334 }
335 
337 ghash4_reduce (ghash4_data_t * gd)
338 {
339  u8x64 r;
340 
341  /* Final combination:
342  gd->lo ^= gd->mid << 64
343  gd->hi ^= gd->mid >> 64 */
344 
345  u8x64 midl = u8x64_word_shift_left (gd->mid, 8);
346  u8x64 midr = u8x64_word_shift_right (gd->mid, 8);
347 
348  if (gd->pending)
349  {
350  gd->lo = u8x64_xor3 (gd->lo, gd->tmp_lo, midl);
351  gd->hi = u8x64_xor3 (gd->hi, gd->tmp_hi, midr);
352  }
353  else
354  {
355  gd->lo ^= midl;
356  gd->hi ^= midr;
357  }
358 
359  r = gmul4_hi_lo (ghash4_poly2, gd->lo);
360  gd->lo ^= u8x64_word_shift_left (r, 8);
361 
362 }
363 
365 ghash4_reduce2 (ghash4_data_t * gd)
366 {
367  gd->tmp_lo = gmul4_lo_lo (ghash4_poly2, gd->lo);
368  gd->tmp_hi = gmul4_lo_hi (ghash4_poly2, gd->lo);
369 }
370 
372 ghash4_final (ghash4_data_t * gd)
373 {
374  u8x64 r;
375  u8x32 t;
376 
377  r = u8x64_xor3 (gd->hi, u8x64_word_shift_right (gd->tmp_lo, 4),
378  u8x64_word_shift_left (gd->tmp_hi, 4));
379 
380  /* horizontal XOR of 4 128-bit lanes */
381  t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
382  return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
383 }
384 #endif
385 
387 ghash_precompute (u8x16 H, u8x16 * Hi, int n)
388 {
389  u8x16 r8;
390  u32x4 r32;
391  /* calcullate H<<1 mod poly from the hash key */
392  r8 = (u8x16) ((u64x2) H >> 63);
393  H = (u8x16) ((u64x2) H << 1);
394  H |= u8x16_word_shift_left (r8, 8);
395  r32 = (u32x4) u8x16_word_shift_right (r8, 8);
396 #ifdef __SSE2__
397  r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
398 #else
399  r32[3] = r32[0];
400 #endif
401  /* *INDENT-OFF* */
402  r32 = r32 == (u32x4) {1, 0, 0, 1};
403  /* *INDENT-ON* */
404  Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);
405 
406  /* calculate H^(i + 1) */
407  for (int i = n - 2; i >= 0; i--)
408  Hi[i] = ghash_mul (H, Hi[i + 1]);
409 }
410 
411 #endif /* __ghash_h__ */
412 
413 /*
414  * fd.io coding-style-patch-verification: ON
415  *
416  * Local Variables:
417  * eval: (c-set-style "gnu")
418  * End:
419  */
#define u8x64_word_shift_left(a, n)
u8x16 tmp_hi
Definition: ghash.h:153
a
Definition: bitmap.h:538
u8x16 hi
Definition: ghash.h:153
static const u8x16 ghash_poly2
Definition: ghash.h:162
#define u8x16_word_shift_left(x, n)
Definition: vector_neon.h:191
u8x16 lo
Definition: ghash.h:153
#define u8x64_word_shift_right(a, n)
static_always_inline u8x32 u8x64_extract_hi(u8x64 v)
static_always_inline void ghash_precompute(u8x16 H, u8x16 *Hi, int n)
Definition: ghash.h:387
static_always_inline u8x16 ghash_final(ghash_data_t *gd)
Definition: ghash.h:244
#define static_always_inline
Definition: clib.h:108
static_always_inline void ghash_reduce(ghash_data_t *gd)
Definition: ghash.h:212
static_always_inline void ghash_mul_next(ghash_data_t *gd, u8x16 a, u8x16 b)
Definition: ghash.h:183
static_always_inline void ghash_reduce2(ghash_data_t *gd)
Definition: ghash.h:237
static_always_inline u8x64 u8x64_xor3(u8x64 a, u8x64 b, u8x64 c)
epu8_epi32 epu16_epi32 u64x2
Definition: vector_sse42.h:691
static_always_inline u8x16 gmul_lo_hi(u8x16 a, u8x16 b)
Definition: ghash.h:131
lo
static_always_inline u8x16 gmul_lo_lo(u8x16 a, u8x16 b)
Definition: ghash.h:109
u8x16 mid
Definition: ghash.h:153
static_always_inline u8x16 ghash_mul(u8x16 a, u8x16 b)
Definition: ghash.h:251
static_always_inline void ghash_mul_first(ghash_data_t *gd, u8x16 a, u8x16 b)
Definition: ghash.h:168
static_always_inline u32x4 u32x4_shuffle(u32x4 v, const int a, const int b, const int c, const int d)
Definition: vector_sse42.h:668
static_always_inline u8x16 u8x16_xor3(u8x16 a, u8x16 b, u8x16 c)
Definition: vector_neon.h:204
static_always_inline u8x16 gmul_hi_lo(u8x16 a, u8x16 b)
Definition: ghash.h:120
sll srl srl sll sra u16x4 i
Definition: vector_sse42.h:317
static const u8x16 ghash_poly
Definition: ghash.h:157
static_always_inline u8x32 u8x64_extract_lo(u8x64 v)
vl_api_ip4_address_t hi
Definition: arp.api:37
static_always_inline u8x16 gmul_hi_hi(u8x16 a, u8x16 b)
Definition: ghash.h:142
#define u8x16_word_shift_right(x, n)
Definition: vector_neon.h:192
int pending
Definition: ghash.h:154
u8x16 tmp_lo
Definition: ghash.h:153
unsigned long long u32x4
Definition: ixge.c:28