/*
 *  Armv8-A Cryptographic Extension support functions for Aarch64
 *
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider the above after https://reviews.llvm.org/D131064 is merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES    1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */
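
/* Note: building this file with, for example, -march=armv8-a+crypto (or +aes)
 * on the command line would normally predefine these feature macros already,
 * so the block above only takes effect when no such flag was given. */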

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_AESCE_HAVE_CODE)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies those
 *       versions, please update this check and the documentation of
 *       `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#    if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies those versions, please update this check and the
 * documentation of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#         error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#    elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#         error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#    endif
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature (__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
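/* The matching "pop" for the pragmas pushed above is at the end of this file,
 * guarded by MBEDTLS_POP_TARGET_PRAGMA. */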

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
#if !defined(HWCAP_NEON)
#define HWCAP_NEON  (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES  (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES   (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif
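/* The fallback values above match the Linux kernel's hwcap bit assignments
 * (HWCAP_NEON/HWCAP2_AES on 32-bit Arm, HWCAP_ASIMD/HWCAP_AES on AArch64);
 * they are only needed when older libc headers do not define them. */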

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be vulnerable to non-atomic updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
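
/* Callers do not normally invoke this function directly: the cached result is
 * read through a wrapper in aesce.h so that the AES code can dispatch at
 * runtime, conceptually along these lines (illustrative sketch only, not the
 * exact wrapper):
 *
 *     if (aesce_is_supported())    // hypothetical wrapper over the cache
 *         use the AESCE routines in this file
 *     else
 *         fall back to the software AES implementation
 */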
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
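
/* Note: vaeseq_u8() performs AddRoundKey (XOR with the round key), SubBytes
 * and ShiftRows in a single instruction, while vaesmcq_u8() performs
 * MixColumns. Each round key is therefore consumed at the start of a round
 * here, and the final AddRoundKey is done separately with veorq_u8() below. */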

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately, which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    } else
#else
    (void) mode;
#endif
    {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
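/* The decryption key schedule produced below mirrors the encryption one:
 * with 16-byte round keys indexed 0..nr, invkey[0] is fwdkey[nr],
 * invkey[i] is InvMixColumns(fwdkey[nr - i]) for 0 < i < nr (vaesimcq_u8
 * provides the InvMixColumns step), and invkey[nr] is fwdkey[0]. */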
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}
#endif

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}
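
/* Note: on the little-endian word layout used by the key schedule,
 * aes_rot_word() is FIPS-197 RotWord(): bytes [a0, a1, a2, a3] become
 * [a1, a2, a3, a0], e.g. aes_rot_word(0x0d0c0b0a) == 0x0a0d0c0b. */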

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
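/* The schedule sizes follow FIPS-197: a 128-bit key expands to 11 round keys
 * (44 words, 176 bytes), a 192-bit key to 13 round keys (52 words, 208 bytes)
 * and a 256-bit key to 15 round keys (60 words, 240 bytes). */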
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
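    /* rcon[i] is x^i in GF(2^8) reduced by the AES polynomial
     * x^8 + x^4 + x^3 + x + 1: each entry is the previous one doubled
     * (xtime), with 0x80 doubling to 0x1b after reduction. */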
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const size_t key_len_in_words = key_bit_length / 32;    /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
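/* This is typically reached from the generic AES key-schedule code once AESCE
 * has been selected; an illustrative (hypothetical) call could look like:
 *
 *     ret = mbedtls_aesce_setkey_enc((unsigned char *) (ctx->buf + ctx->rk_offset),
 *                                    key, keybits);
 */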
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types, which
 * are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1).
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5 and Clang on 32-bit Arm */

#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;
}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible type error without the cast: it treats poly64_t
 * and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, while Clang accepts
 * the code with or without it (it treats poly64_t and poly64x1_t as the same
 * type), so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does a 128b polynomial multiplication on blocks in GF(2^128) defined
 * by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to implement one 128b multiplication.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs the 256b
 * result represented by three 128b vectors, as a code size optimization.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high   128b |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low    128b |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);                       /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);                        /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8);                      /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);                       /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);                        /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);                         /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
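
/* Equivalently, splitting a = a1:a0 and b = b1:b0 into 64-bit halves:
 *
 *     a * b = (a1*b1) << 128 + (a1*b0 + a0*b1) << 64 + (a0*b0)
 *
 * (carry-less multiplication, with XOR as addition), which is exactly h, m
 * and l above; poly_mult_reduce() below takes this positional layout into
 * account when folding the 256-bit result back into 128 bits. */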

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known
 * approach is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us
 * to write the 256-bit operand to be reduced as
 * a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z). That is, we simply multiply the
 * higher part of the operand by r(z) and add it to l(z). If the result is
 * still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. This is for GNU C compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];            /* h3:h2:00:00                          */
    m = input.val[1];            /*   :m2:m1:00                          */
    l = input.val[2];            /*   :  :l1:l0                          */
    c = pmull_high(h, MODULO);   /*   :c2:c1:00 = reduction of h3        */
    d = pmull_low(h, MODULO);    /*   :  :d1:d0 = reduction of h2        */
    e = veorq_u8(c, m);          /*   :e2:e1:00 = m2:m1:00 + c2:c1:00    */
    f = pmull_high(e, MODULO);   /*   :  :f1:f0 = reduction of e2        */
    g = vextq_u8(ZERO, e, 8);    /*   :  :g1:00 = e1:00                  */
    n = veorq_u8(d, l);          /*   :  :n1:n0 = d1:d0 + l1:l0          */
    o = veorq_u8(n, f);          /*       o1:o0 = f1:f0 + n1:n0          */
    return veorq_u8(o, g);       /*             = o1:o0 + g1:00          */
}
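
/* GCM represents field elements bit-reflected: the most significant bit of
 * the first byte is the coefficient of x^0. The inputs are therefore
 * bit-reversed with vrbitq_u8() before the carry-less multiplication, and the
 * product is bit-reversed back, so that the code above can work on the
 * conventional little-endian polynomial representation. */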

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_AESCE_HAVE_CODE */

#endif /* MBEDTLS_AESCE_C */