1 /*
2 * Copyright 2018-2021 NXP
3 * All rights reserved.
4 *
5 *
6 * SPDX-License-Identifier: BSD-3-Clause
7 */
8
9 #include "fsl_casper.h"
10 #include <math.h> /* ceil TODO check if really need it */
11
12 /*******************************************************************************
13 * Definitions
14 ******************************************************************************/
15
16 /* Component ID definition, used by tools. */
17 #ifndef FSL_COMPONENT_ID
18 #define FSL_COMPONENT_ID "platform.drivers.casper"
19 #endif
20
/* Recoding length for the secure scalar multiplication:
 * Use n=256 and w=4 --> compute ceil(256/3) = 86 + 1 digits
 * Use n=384 and w=4 --> compute ceil(384/3) = 128 + 1 digits
 * Use n=521 and w=4 --> compute ceil(521/3) = 174 + 1 digits
 */
26
/*! Recoding length for the secure scalar multiplication:
 * ceil(bitlen / 3) + 1 signed digits when using window width w = 4. */
enum _casper_ecc_recode_len
{
    kCASPER_ECC_P256_recode_len = 87u,  /* ceil(256/3) + 1 */
    kCASPER_ECC_P384_recode_len = 129u, /* ceil(384/3) + 1 */
    kCASPER_ECC_P521_recode_len = 175u, /* ceil(521/3) + 1 */
};
34
/*! Operand bit lengths per supported NIST curve as used by the accelerator.
 * Note: P-521 operands are padded up to 576 bits (18 32-bit words). */
enum _casper_ecc_N_bitlen
{
    kCASPER_ECC_P256_N_bitlen = 256u,
    kCASPER_ECC_P384_N_bitlen = 384u,
    kCASPER_ECC_P521_N_bitlen = 576u, /* 521 rounded up to a multiple of 64 bits */
};
41
/*! Operand lengths in 32-bit words per supported curve
 * (matches _casper_ecc_N_bitlen / 32). */
enum _casper_ecc_N_wordlen
{
    kCASPER_ECC_P256_wordlen = 256U / 32U,
    kCASPER_ECC_P384_wordlen = 384u / 32U,
    kCASPER_ECC_P521_wordlen = 576u / 32U, /* padded P-521 size, see N_bitlen enum */
};
48
49 #if defined(__GNUC__)
50 /* Enforce O1 optimize level, specifically to remove strict-aliasing option.
51 (-fno-strict-aliasing is required for this driver). */
52 #pragma GCC push_options
53 #pragma GCC optimize("-O1")
54 #endif
55
56 #if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
57 /* Enforce optimization off for clang, specifically to remove strict-aliasing option.
58 (-fno-strict-aliasing is required for this driver). */
59 #pragma clang optimize off
60 #endif
61
62 /* CASPER driver allows usage of 256, 384 and 521 ECC */
63 #define CASPER_MAX_ECC_SIZE_WORDLEN (576u / 32U)
64 #define CASPER_RECODE_LENGTH_MAX 175
65
66 #define CASPER_RAM_BASE_NS (FSL_FEATURE_CASPER_RAM_BASE_ADDRESS)
67
68 #if defined(FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED) && FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED
69 #define CASPER_RAM_OFFSET (FSL_FEATURE_CASPER_RAM_OFFSET)
70 #define INTERLEAVE(addr) \
71 (((((((addr) >> 2U) & 0x00000001U) << CASPER_RAM_OFFSET) + (((addr) >> 3U) << 2U) + ((addr)&0x00000003U)) & \
72 0xFFFFU) | \
73 s_casperRamBase)
74 #define DEINTERLEAVE(addr) INTERLEAVE(addr)
75 #define GET_WORD(addr) (*((uint32_t *)DEINTERLEAVE((uint32_t)(addr))))
76 #define GET_DWORD(addr) (((uint64_t)GET_WORD(addr)) | (((uint64_t)GET_WORD(((uint32_t)(addr)) + 4U)) << 32U))
77 #define SET_WORD(addr, value) *((uint32_t *)INTERLEAVE((uint32_t)(addr))) = ((uint32_t)(value))
78 #define SET_DWORD(addr, value) \
79 do \
80 { \
81 SET_WORD(addr, (uint32_t)(value & 0xFFFFFFFFU)); \
82 SET_WORD(((uint32_t)(addr)) + 4U, (uint32_t)((value & 0xFFFFFFFF00000000U) >> 32U)); \
83 } while (false)
84
85 /* memcopy is always word aligned */
86 /* interleaved to interleaved
87 static void CASPER_MEMCPY_I2I(void *dst, const void *src, size_t siz)
88 */
89 #define CASPER_MEMCPY_I2I(dst, src, siz) \
90 \
91 { \
92 uint32_t *dst32 = (uint32_t *)(dst); \
93 const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
94 uint32_t i; \
95 for (i = 0U; i < (siz) / 4U; i++) \
96 { \
97 SET_WORD(&dst32[i], GET_WORD(&src32[i])); \
98 } \
99 }
100
101 /* interleaved to non-interleaved
102 static void CASPER_MEMCPY_I2N(void *dst, const void *src, size_t siz)
103 */
104 #define CASPER_MEMCPY_I2N(dst, src, siz) \
105 \
106 { \
107 uint32_t *dst32 = (uint32_t *)(dst); \
108 const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
109 uint32_t i; \
110 for (i = 0U; i < (siz) / 4U; i++) \
111 { \
112 dst32[i] = GET_WORD(&src32[i]); \
113 } \
114 }
115
116 /* non-interleaved to interleaved
117 static void CASPER_MEMCPY_N2I(void *dst, const void *src, size_t siz)
118 */
119 #define CASPER_MEMCPY_N2I(dst, src, siz) \
120 \
121 { \
122 volatile uint32_t *dst32 = (uint32_t *)(dst); \
123 const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
124 uint32_t i; \
125 for (i = 0U; i < (siz) / 4U; i++) \
126 { \
127 SET_WORD(&dst32[i], src32[i]); \
128 } \
129 }
130 #else
131 #define GET_WORD(addr) (*((uint32_t *)(uint32_t)(addr)))
132 #define GET_DWORD(addr) (*((uint64_t *)(addr)))
133 #define SET_WORD(addr, value) *((uint32_t *)(uint32_t)(addr)) = ((uint32_t)(value))
134 #define SET_DWORD(addr, value) *((uint64_t *)(addr)) = ((uint64_t)(value))
135
136 #define CASPER_MEMCPY_I2I(dst, src, siz) (void)memcpy(dst, src, siz)
137 #define CASPER_MEMCPY_I2N(dst, src, siz) (void)memcpy(dst, src, siz)
138 #define CASPER_MEMCPY_N2I(dst, src, siz) (void)memcpy(dst, src, siz)
139 #endif
140
141 #define WORK_BUFF_MUL4 (N_wordlen_max * 4 + 2) /* ! working buffer is 4xN_wordlen to allow in place math */
142 #define N_bytelen (N_wordlen * 4U) /* for memory copy and the like */
143 #define N_dwordlen (unsigned)(N_wordlen / 2U)
144
145 #define PreZeroW(i, w_out) \
146 for ((i) = 0U; (i) < N_wordlen; (i) += 4U) \
147 { \
148 SET_WORD(&(w_out)[(i) + 0U], 0U); \
149 SET_WORD(&(w_out)[(i) + 1U], 0U); \
150 SET_WORD(&(w_out)[(i) + 2U], 0U); \
151 SET_WORD(&(w_out)[(i) + 3U], 0U); \
152 } /* unrolled partly */
153 #define PreZeroW2up(i, w_out) \
154 for (i = N_wordlen; i <= N_wordlen * 2U; i += 4U) \
155 { \
156 SET_WORD(&w_out[i + 0U], 0U); \
157 SET_WORD(&w_out[i + 1U], 0U); \
158 SET_WORD(&w_out[i + 2U], 0U); \
159 SET_WORD(&w_out[i + 3U], 0U); \
160 } /* unrolled partly */
161
162 /* Macros for the ECC component in Casper */
163
164 /* CASPER memory layout for ECC */
165
166 #define CASPER_MEM ((uint32_t *)msg_ret)
167
168 /* Currently these macros work on 32-bit platforms */
169
170 #define add(c1, c0, a, b) \
171 \
172 do \
173 { \
174 uint32_t _t; \
175 _t = a + b; \
176 c1 = (uint32_t)(_t < a); \
177 c0 = _t; \
178 \
179 } while (false)
180
181 #define add_cout(carry, c, a, b) add((carry), (c), (a), (b))
182
183 #define add_cout_cin(carryout, c, a, b, carryin) \
184 do \
185 { \
186 uint64_t _t = (uint64_t)(a) + (b) + (carryin); \
187 (c) = (uint32_t)_t; \
188 (carryout) = (uint32_t)(_t >> 32); \
189 } while (false)
190
191 #define sub_borrowout(borrow, c, a, b) \
192 do \
193 { \
194 uint32_t _b = (uint32_t)((b) > (a)); \
195 (c) = (a) - (b); \
196 (borrow) = _b; \
197 } while (false)
198
199 #define sub_borrowin_borrowout(borrowout, c, a, b, borrowin) \
200 do \
201 { \
202 uint32_t _t, _borrow1, _borrow2; \
203 sub_borrowout(_borrow1, _t, (a), (b)); \
204 sub_borrowout(_borrow2, (c), _t, (borrowin)); \
205 (borrowout) = _borrow1 + _borrow2; \
206 } while (false)
207
/* NOTE(review): `b` in `c = a - b;` below is NOT a parameter of
 * sub_borrowout_1 -- it is captured from whatever variable named `b` is in
 * scope at the expansion site. As the "_1" name and the unused local
 * `_b = 0` suggest, the intent may have been `c = a - _b` (i.e. c = a with
 * zero borrow). Confirm against the call sites before changing. */
#define sub_borrowout_1(borrow, c, a) \
    do                                \
    {                                 \
        uint32_t _b = 0;              \
        c = a - b;                    \
        borrow = _b;                  \
    } while (false)

/* (c, borrowout) = a - borrowin, chained through sub_borrowout_1 above. */
#define sub_borrowin_borrowout_1(borrowout, c, a, borrowin) \
    do                                                      \
    {                                                       \
        uint32_t _t, _borrow1, _borrow2;                    \
        sub_borrowout_1(_borrow1, _t, a);                   \
        sub_borrowout(_borrow2, c, _t, borrowin);           \
        borrowout = _borrow1 + _borrow2;                    \
    } while (false)
224
225 /* 32 x 32 --> 64-bit multiplication
226 * (c1,c0) = a * b
227 */
228 #define mul(c1, c0, a, b) \
229 \
230 do \
231 { \
232 uint64_t __m; \
233 __m = (uint64_t)a * (uint64_t)b; \
234 c0 = (uint32_t)__m; \
235 c1 = (uint32_t)(__m >> (uint64_t)32); \
236 \
237 } while (false)
238
239 /* Multiply-and-accumulate
240 * (c1,c0) = a*b+c0
241 */
242 #define muladd(c1, c0, a, b) \
243 \
244 do \
245 { \
246 uint32_t __ma = c0; \
247 mul(c1, c0, a, b); \
248 c0 = c0 + __ma; \
249 c1 = c1 + (c0 < __ma); \
250 \
251 } while (0)
252
253 /* Multiply-and-accumulate-accumulate
254 * (c1,c0) = a*b+c0+c1
255 */
256 #define muladdadd(c1, c0, a, b) \
257 \
258 do \
259 { \
260 uint32_t __maa0 = c0, __maa1 = c1; \
261 mul(c1, c0, a, b); \
262 c0 = c0 + __maa0; \
263 c1 = c1 + (c0 < __maa0); \
264 c0 = c0 + __maa1; \
265 c1 = c1 + (c0 < __maa1); \
266 \
267 } while (0)
268
269 #define square_casper(c, a) multiply_casper(c, a, a)
270 #define sub_casper(c, a, b) CASPER_montsub(c, a, b, &CASPER_MEM[(N_wordlen + 4U)])
271 #define add_casper(c, a, b) CASPER_montadd(c, a, b, &CASPER_MEM[(N_wordlen + 4U)])
272 #define mul2_casper(c, a) add_casper(c, a, a)
273 #define half(c, a, b) CASPER_half(c, a, b)
274 /*******************************************************************************
275 * Variables
276 ******************************************************************************/
277
278 /* The model for this algo is that it can be implemented for a fixed size RSA key */
279 /* for max speed. If this is made into a variable (to allow varying size), then */
280 /* it will be slower by a bit. */
281 /* The file is compiled with N_bitlen passed in as number of bits of the RSA key */
282 /* #define N_bitlen 2048 */
/* Number of 32-bit words of the active modulus; set per operation and used by
 * the N_bytelen/N_dwordlen macros and all loop bounds in this file. */
static size_t N_wordlen = 0U; /* ! number of words (e.g. 4096/32 is 128 words) */

/* Base of CASPER RAM; used by the INTERLEAVE/DEINTERLEAVE address macros
 * and by CA_MK_OFF() to form accelerator operand offsets. */
static uint32_t s_casperRamBase = CASPER_RAM_BASE_NS;
/* Working buffer overlaid on CASPER RAM (see CASPER_MEM macro). */
static uint32_t *msg_ret = (uint32_t *)CASPER_RAM_BASE_NS;

/* All curve constants below are stored least-significant 32-bit word first. */

/* NISTp-256 = 2^256-2^224+2^192+2^96-1 */
static uint32_t NISTp256[256 / 32u] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0x00000000,
                                       0x00000000, 0x00000000, 0x00000001, 0xffffffffU};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp256_q[256 / 32u] = {0xfc632551U, 0xf3b9cac2U, 0xa7179e84U, 0xbce6faadU,
                                         0xffffffffU, 0xffffffffU, 0x00000000, 0xffffffffU};

/* R = 2^256 mod p, the value "1" in Montgomery form. */
static uint32_t NISTr256[256 / 32u] = {0x00000001, 0x00000000, 0x00000000, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xfffffffeU, 0x00000000};

/* -p^-1 mod 2^64 = 1 (low/high words) */
static uint32_t Np256[2] = {1, 0};

/* NISTp-384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 */
static uint32_t NISTp384[384 / 32u] = {0xffffffffU, 0x00000000, 0x00000000, 0xffffffffU, 0xfffffffeU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp384_q[384 / 32u] = {0xccc52973U, 0xecec196aU, 0x48b0a77aU, 0x581a0db2U, 0xf4372ddfU, 0xc7634d81U,
                                         0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};

/* R = 2^384 mod p, the value "1" in Montgomery form. */
static uint32_t NISTr384[384 / 32u] = {0x00000001, 0xffffffffU, 0xffffffffU, 0x00000000, 0x1, 0, 0, 0, 0, 0, 0, 0};

// -p^-1 mod 2^64 = 0x100000001
static uint32_t Np384[2] = {1, 1};

/* NISTp-521 = 2^521 - 1, padded to 576 bits (18 words) */
static uint32_t NISTp521[576 / 32U] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU, 0};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp521_q[576 / 32U] = {0x91386409U, 0xbb6fb71eU, 0x899c47aeU, 0x3bb5c9b8U, 0xf709a5d0U, 0x7fcc0148U,
                                         0xbf2f966bU, 0x51868783U, 0xfffffffaU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                         0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU, 0};

/* R = 2^576 mod p = 2^55, the value "1" in Montgomery form. */
static uint32_t NISTr521[576 / 32U] = {0, 0x800000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* -p^-1 mod 2^64 = 1 */
static uint32_t Np521[2] = {1, 0};
331
332 /*******************************************************************************
333 * Prototypes
334 ******************************************************************************/
335
336 /* Convert a projective point (X1 : Y1 : Z1)
337 * to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3)
338 * The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap
339 */
340 void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);
341
342 /* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
343 * where (X1: Y1: Z1) != (X2 : Y2 : Z2)
344 * (X3 : Y3: Z3) may be the same as one of the inputs.
345 */
346 void Jac_addition(uint32_t *X3,
347 uint32_t *Y3,
348 uint32_t *Z3,
349 uint32_t *X1,
350 uint32_t *Y1,
351 uint32_t *Z1,
352 uint32_t *X2,
353 uint32_t *Y2,
354 uint32_t *Z2);
355
356 /* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2)
357 * where (X1: Y1: Z1) != (X2, Y2)
358 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
 * Source: 2004 Hankerson-Menezes-Vanstone, "Guide to Elliptic Curve Cryptography", page 91.
360 */
361 void Jac_add_affine(
362 uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2);
363
/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, "Guide to Elliptic Curve Cryptography", page 91.
365 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
366 * (X3 : Y3: Z3) may be the same as the input.
367 */
368 void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);
369
370 /* Constant time elliptic curve scalar multiplication.
371 * Source: https://eprint.iacr.org/2014/130.pdf
372 * when using w = 4.
373 * Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
374 * p is the prime used to define the finite field F_p
375 * q is the (prime) order of the curve
376 */
377 void Jac_scalar_multiplication(
378 uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q);
379
380 /* Compute the double scalar multiplication
381 * (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
382 * Using Shamir's trick and precomputing 16 points.
383 * This code is *not* constant time since this is used
384 * for verification only.
385 */
386 void double_scalar_multiplication(uint32_t *X3,
387 uint32_t *Y3,
388 uint32_t *Z3,
389 uint32_t *X1,
390 uint32_t *Y1,
391 uint32_t *k1,
392 uint32_t *X2,
393 uint32_t *Y2,
394 uint32_t *k2);
395
396 /* Compute inversion modulo NIST-p384 using Fermats little theorem.
397 * Using c = a^(p-2) = a^(-1) mod p.
398 * This computes the modular inversion if all arithmetic is "regular"
399 * modular arithmetic or computes automatically the Montgomery inverse
400 * if all arithmetic is Montgomery arithmetic.
401 */
402 static void invert_mod_p384(uint32_t *c, uint32_t *a);
403
404 /* Modular inversion for NIST-P256 */
405 static void invert_mod_p256(uint32_t *c, uint32_t *a);
406
407 /* Modular inversion for NIST-P521 */
408 static void invert_mod_p521(uint32_t *c, uint32_t *a);
409
410 // A and C do not need to be in Casper memory
411 static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A);
412 static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A);
413 static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A);
414
415 static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);
416 static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);
417
418 /* Compute c = a/2 mod p where b is scratch space. */
419 static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b);
420
421 void CASPER_MEMCPY(void *dst, const void *src, size_t siz);
422
423 static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[]);
424
425 static uint8_t int8abs(int8_t v);
426
427 /* Constant time select c = a if m = 0 or
428 * c = b if m = 1
429 * a, b, c are n words
430 */
431 static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n);
432
433 /* Dumb n-limb addition of c=a+b, return carry. */
434 static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);
435
436 #if 0
437 /* Dumb n-limb addition of c=a+b, return carry. */
438 static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);
439
440 /* Dumb n-limb subtraction of c=a-b, return borrow. */
441 static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);
442 #endif
443
444 /* Dumb n-limb subtraction of c=a-b, return borrow. */
445 static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);
446
447 int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
448 const unsigned exp_pubkey,
449 const unsigned pubkey[N_wordlen_max],
450 unsigned MsgRet[WORK_BUFF_MUL4]);
451
452 int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],
453 const unsigned exp_pubkey,
454 const unsigned pubkey[N_wordlen_max],
455 unsigned MsgRet[WORK_BUFF_MUL4]);
456
457 void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[]);
458
459 void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret);
460 void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[]);
461 void MultprecModulo(unsigned r_out[], const unsigned v[], int top);
462 void MultprecCiosMul(
463 unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np);
464 void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[]);
465
466 static void MultprecCiosMul_ct(
467 uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np);
468
469 static void MultprecCiosMul521_ct(
470 uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np);
471
472 static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c);
473 static void shiftright(uint32_t *z, uint32_t *x, uint32_t c);
474 static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c);
475
476 /*******************************************************************************
477 * Code
478 ******************************************************************************/
479
CA_MK_OFF(const void * addr)480 __STATIC_FORCEINLINE uint32_t CA_MK_OFF(const void *addr)
481 {
482 return ((uint32_t)(const uint32_t *)addr - s_casperRamBase);
483 }
484
485 #if 1
/* Busy-wait until the CASPER accelerator reports completion of the previously
 * started operation (DONE bit set in the STATUS register). */
__STATIC_FORCEINLINE void Accel_done(void)
{
    register uint32_t status;
    do
    {
        status = CASPER->STATUS;
    } while (0U == (status & CASPER_STATUS_DONE_MASK));
}
494
/* Program the operand offsets for the next accelerator operation:
 * AB operand offset in the low half-word, CD operand offset in the high. */
__STATIC_FORCEINLINE void Accel_SetABCD_Addr(uint32_t ab, uint32_t cd)
{
    CASPER->CTRL0 = ab | (cd << 16); /* CDoffset << 16 | ABoffset */
}
499
/* Start an accelerator operation by writing the combined
 * iteration-count/opcode/result-offset command word to CTRL1
 * (see Accel_IterOpcodeResaddr for the encoding). */
__STATIC_FORCEINLINE void Accel_crypto_mul(uint32_t ctrl1)
{
    CASPER->CTRL1 = ctrl1;
}
504 #else
505 #include "intrinsics.h"
506 #define Accel_done() \
507 { \
508 register uint32_t status; \
509 do \
510 { \
511 status = CASPER_Rd32b(CASPER_CP_STATUS); \
512 } while (0 == (status & CASPER_STATUS_DONE_MASK)); \
513 }
514 #if 0
515 __STATIC_FORCEINLINE void Accel_done(void)
516 {
517 register uint32_t status;
518 do
519 {
520 status = CASPER->STATUS;
521 } while (0 == (status & CASPER_STATUS_DONE_MASK));
522 }
523 #endif
524 #define Accel_SetABCD_Addr(ab, cd) CASPER_Wr32b((uint32_t)ab | ((uint32_t)cd << 16), CASPER_CP_CTRL0);
525 #define Accel_crypto_mul(ctrl1) CASPER_Wr32b((uint32_t)ctrl1, CASPER_CP_CTRL1);
526 #endif
527
/* Compose a CTRL1 command word from the iteration count, the operation code
 * and the result offset (result offset occupies the upper half-word). */
__STATIC_FORCEINLINE uint32_t Accel_IterOpcodeResaddr(uint32_t iter, uint32_t opcode, uint32_t resAddr)
{
    return CASPER_CTRL1_ITER(iter) | CASPER_CTRL1_MODE(opcode) | (resAddr << 16);
}
532
/* Copy siz bytes between buffers, routing each endpoint through the
 * interleaved-RAM access macros when it lies inside CASPER RAM and through
 * plain memcpy otherwise.
 * The `| 0x10000000u` normalizes both the pointer and the RAM base before the
 * range comparison -- presumably to fold secure/non-secure address aliases
 * onto a single view (NOTE(review): confirm against the device memory map).
 * The 8u * 1024u constant is the assumed CASPER RAM size in bytes. */
void CASPER_MEMCPY(void *dst, const void *src, size_t siz)
{
    /* true when dst falls inside the CASPER RAM window */
    bool bdst =
        ((((uint32_t)(uint32_t *)dst) | 0x10000000u) >= ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) &&
         (((uint32_t)(uint32_t *)dst) | 0x10000000u) <
             ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u);

    /* true when src falls inside the CASPER RAM window */
    bool bsrc = ((((uint32_t)(const uint32_t *)src) | 0x10000000u) >=
                     ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) &&
                 (((uint32_t)(const uint32_t *)src) | 0x10000000u) <
                     ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u);

    if (bdst && bsrc)
    {
        CASPER_MEMCPY_I2I(dst, src, siz);
    }
    else if (bdst && !bsrc)
    {
        CASPER_MEMCPY_N2I(dst, src, siz);
    }
    else if (!bdst && bsrc)
    {
        CASPER_MEMCPY_I2N(dst, src, siz);
    }
    else
    {
        (void)memcpy(dst, src, siz);
    }
}
562
563 /* Constant time select c = a if m = 0 or
564 * c = b if m = 1
565 * a, b, c are n words
566 */
/* Branch-free (constant-time) word-wise select:
 * c = a when m == 0, c = b when m == 1; a, b, c are n words long. */
static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n)
{
    /* m == 1 -> maskB = all ones, maskA = 0; m == 0 -> the reverse. */
    uint32_t maskB = 0U - (uint32_t)m;
    uint32_t maskA = ~maskB;
    int idx        = 0;

    while (idx < n)
    {
        uint32_t wordA = GET_WORD(&a[idx]);
        uint32_t wordB = GET_WORD(&b[idx]);
        SET_WORD(&c[idx], (wordA & maskA) | (wordB & maskB));
        idx++;
    }
}
577
578 /* Compute R`, which is R mod N. This is done using subtraction */
579 /* R has 1 in N_wordlen, but we do not fill it in since borrowed. */
580 /* Exp-pubkey only used to optimize for exp=3 */
/* Compute R' = R mod Nmod where R = 2^(32*N_wordlen): the single 1 bit of R
 * lives one word above the buffer and is treated as already "borrowed", so a
 * single hardware subtraction 0 - Nmod produces R - Nmod = R mod Nmod
 * (valid because Nmod's top bit is 1 by definition).
 * exp_pubkey is unused here; it exists to allow an exp == 3 optimization. */
void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[])
{
    uint32_t i;

    /* R is 2^n where n is 1 bit longer than Nmod, so 1 followed by 32 or 64 0 words for example */
    /* Note that Nmod's upper most bit has to be 1 by definition, so one subtract is enough. We */
    /* do not set the 1 since it is "borrowed" so no point */
    PreZeroW(i, Rp);
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(Rp)));
    Accel_done();
    /* final borrow cannot happen since we know we started with a larger number */
}
594
595 /* MultprecMultiply - multiple w=u*v (per Knuth) */
596 /* w_out is 2x the size of u and v */
/* Schoolbook multi-precision multiply w_out = u * v on the accelerator.
 * u and v are N_wordlen words; w_out receives the 2*N_wordlen-word product.
 * v is consumed two 32-bit words (one 64-bit limb) per accelerator pass. */
void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[])
{
    uint32_t i, j;

    /* Knuth 4.3.1 - Algorithm M */
    /* Compute w = u * v */
    /* u and v are N bits long in 32 bit word form */
    /* w is 2*N bits long in 32 bit word form */
    /* Note: We just multiply in place */

    /* Step 1. Fill w[t-1:0] with 0s, the upper half will be written as we go */
    PreZeroW(i, w_out);

    /* We do 1st pass NOSUM so we do not have to 0 output */
    Accel_SetABCD_Addr(CA_MK_OFF(&v[0]), CA_MK_OFF(u));
    Accel_crypto_mul(
        Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464NoSum, CA_MK_OFF(&w_out[0])));
    Accel_done();
    /* Step 2. iterate over N words of v using j */
    for (j = 2U; j < N_wordlen; j += 2U)
    {
        /* Step 2b. Check for 0 on v word - skip if so since we 0ed already */
        /* Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
        if (0U != (GET_WORD(&v[j])) || 0U != (GET_WORD(&v[j + 1U])))
        {
            Accel_SetABCD_Addr(CA_MK_OFF(&v[j]), CA_MK_OFF(u));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464Sum, CA_MK_OFF(&w_out[j])));
            Accel_done();
        }
    }
}
629
630 /* MultprecModulo performs divide to get remainer as needed for RSA */
631 /* This performs (q,r) = u/v, but we do not keep q */
632 /* r_out is module (remainder) and is 2*N */
633 /* u is in r_out (1st N) at start (passed in) */
634 /* v is N long */
/* Knuth Algorithm D long division computing only the remainder:
 * r_out (2*N words, initially holding u) becomes u mod v; v is N words.
 * Division is done 16 bits at a time because Cortex-M divide is 32/32=32
 * while multiply is 32x32=64; quotient digits are estimated from the top
 * words and corrected by at most 2 per Knuth. */
void MultprecModulo(unsigned r_out[], const unsigned v[], int top)
{
    uint64_t u64;                      /* use 64 bit math mixed with 32 bit */
    unsigned u32;                      /* allows us to work on U in 32 bit */
    unsigned u_n, ul16, uh16, *u_shft; /* u_shft is because r_out is u initially */
    unsigned vl16, vh16, v_Nm1;
    unsigned q_hat, r_hat, q_over;
    unsigned borrow, carry;
    uint32_t i;
    int j, tmp;

    /* Knuth 4.3.1 - Algorithm D */
    /* Compute q = u / v giving remainder r = u mod v */
    /* -- we only want r, so we build qhat but do not store the Qs */
    /* v is N long, with u,q,r 2N long because u is slowly replavced by r. */
    /* We normalize/unnormlize per Knuth in the buffer (not copied) */

    /* Step 1. Normalize value so MSb is in v[n-1]. Remember that v is */
    /* the public key - to call it a 2048 bit number, they cannot have 0 */
    /* in the MSb (or it would be less than 2048 bits) and so we know we */
    /* are normalized already. Therefore, u is effectively shifted already. */
    /* For u, we have it in r_out. u[n] holds any overflow */
    /* Since divide on CM3/4 is 32/32=32, we break into 16 bit halves, but */
    /* multiply can be 32x32=64. */
    u_n    = 0;
    u_shft = r_out; /* u (shifted) is in r_out */

    v_Nm1 = GET_WORD(&v[N_wordlen - 1U]); /* MSw of public key */
    vl16  = v_Nm1 & 0xFFFFU;              /* lower 16 */
    vh16  = v_Nm1 >> 16;                  /* upper 16 */
    /* Step 2. Iterate j from m-n down to 0 (M selected per Knuth as 2*N) */
    for (j = top; j >= 0; j--)
    {
        /* Step 3. estimate q_hat as (U[j+n]*B + U[j+n-1]) / V[n-1] */
        /* Note: using subset of Knuth algo since v is 1/2 len of u (which is */
        /* from multiply or x^2 leading into this). */
        u32 = u_n; /* pickup u4u3u2, knowing u4 is 0 */
        u64 = ((uint64_t)u_n << 32) | GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 1U]);
        ul16 = (unsigned int)(u64 & 0xFFFFU);         /* lower 16 */
        uh16 = (unsigned int)((u64 >> 16) & 0xFFFFU); /* upper 16 */

        /* we see if even possible (u large enough relative to v) */
        if ((u32 - v_Nm1) <= u32)
        {
            u32 -= v_Nm1;
            q_over = 1; /* overflow from the sub */
        }
        else
        {
            q_over = 0;
        }
        /* q_hat = u32 / vh16 -- is the upper partial value */
        /* estimate; if too much, then back down by 1 or 2 */
        q_hat = u32 / vh16;
        r_hat = u32 - (q_hat * vh16);
        /* see if Q is more than 16 bits or remainder is too large (over div) */
        if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | uh16)))
        {
            /* too much - undo a division */
            q_hat--;
            r_hat += vh16;
            /* check if still too much */
            if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | uh16)))
            {
                q_hat--; /* yes, so undo a 2nd */
            }
        }

        /* compose u3u2uh16, then sub q_hat*v if OK */
        u64 = (((uint64_t)u32 << 16) | uh16) - ((uint64_t)q_hat * v_Nm1);
        if (0U != (u64 >> 48))
        {
            /* no, so add v back */
            u32 = (unsigned)(u64 + v_Nm1);
            q_hat--;
        }
        else
        {
            u32 = (unsigned)u64;
        }
        tmp = (int32_t)(uint32_t)(q_hat << 16); /* quotient upper part */

        /* divide lower part: q = u2uh16ul16 / v. */
        /* estimate and add back if over divdied */
        q_hat = u32 / vh16;
        r_hat = u32 - (q_hat * vh16);
        if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | ul16)))
        {
            /* too much - undo a division */
            q_hat--;
            r_hat += vh16;
            /* check if still too much */
            if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | ul16)))
            {
                q_hat--; /* yes, so undo a 2nd */
            }
        }

        /* compose u2uh16ul16, then sub q_hat*v if OK */
        u64 = (((uint64_t)u32 << 16) | ul16) - ((uint64_t)q_hat * v_Nm1);
        if (0U != (u64 >> 48))
        {
            /* no, so add v back */
            r_hat = (unsigned)(u64 + v_Nm1);
            q_hat--;
        }
        else
        {
            r_hat = (unsigned)u64;
        }
        q_hat |= (unsigned)tmp; /* other half of the quotient */
        /* final correction loop: compare against the next-lower word of v */
        while ((q_over != 0U) || ((uint64_t)q_hat * GET_WORD(&v[N_wordlen - 2U])) >
                                     ((1ULL << 32) * r_hat) + (uint64_t)GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 2U]))
        { /* if Qhat>b, then reduce to b-1, then adjust up Rhat */
            q_hat--;
            r_hat += v_Nm1;
            if (r_hat < v_Nm1)
            {
                break; /* no overflow */
                       /* else repeat since Rhat >= b */
            }
        }

        /* Step 4. Multiply and subtract. We know the amount, */
        /* so we do the schoolboy math. Have to do on */
        /* the large value. */
        if (q_hat != 0U)
        {
            borrow = 0;
            for (i = 0; i < N_wordlen; i++)
            {
                u64    = (uint64_t)q_hat * GET_WORD(&v[i]) + borrow;
                borrow = (unsigned)(u64 >> 32);
                if (GET_WORD(&u_shft[i + (unsigned)j]) < (unsigned)u64)
                {
                    borrow++; /* carry the overflow */
                }
                SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) - (unsigned)u64);
            }
            u_n -= borrow; /* overflow from shift left does not fit otherwise */
        }

        /* Store 5. (update Q - we don't), and add back V to remainder if we over-subtracted */
        /* That restores remainder to correct (we could only be off by 1) */
        /* This should happen very rarely. */
        if (u_n != 0U)
        {
            carry = 0;
            for (i = 0; i < N_wordlen; i++)
            {
                SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + carry);
                carry = (GET_WORD(&u_shft[i + (unsigned)j]) < carry) ? 1U : 0U;
                SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + GET_WORD(&v[i]));
                if (GET_WORD(&u_shft[i + (unsigned)j]) < GET_WORD(&v[i]))
                {
                    carry++;
                }
            }
        }
        u_n = GET_WORD(
            &u_shft[(uint32_t)j + N_wordlen - 1U]); /* hold upper part of u to catch overflow (to borrow from) */
    }
    /* low N bits of r are valid as remainder */
}
799
800 /* We convert X into a Mont form number. Note length of arrays: */
801 /* x is N_wordlen, Nmod is N_wordlen */
802 /* Rp is N_wordlen (it is R` which is R mod N) */
803 /* Xmont_out is N_wordlen*2+1 */
/* Convert x into Montgomery form: Xmont_out = x * R mod Nmod, computed as the
 * double-width product x * R' (R' = R mod N, from MultprecMontCalcRp)
 * followed by a modular reduction. Xmont_out must hold 2*N_wordlen+1 words. */
void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[])
{
    MultprecMultiply(Xmont_out, x, Rp);
    MultprecModulo(Xmont_out, Nmod, (int32_t)N_wordlen);
}
809
/* Compute np64 = -Nmod^-1 mod 2^64 for Montgomery reduction.
 * Uses Newton's iteration: each COMP_NPN_1 application doubles the number of
 * correct low-order bits of nprime = Nmod^-1 (starting from 4 exact bits),
 * so four applications reach 64 bits; the result is then negated mod 2^64. */
void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret) /* only pass the low order double word */
{
    uint64_t nprime, Nmod_0;
    Nmod_0 = GET_WORD(&Nmod[0]) | ((uint64_t)GET_WORD(&Nmod[1]) << 32);

#define COMP_NPN_1 ((2U - Nmod_0 * nprime) * nprime) /* computes N`*N0=1 mod 2^P where P is the partial built up */
    nprime = (((2U + Nmod_0) & 4U) << 1) + Nmod_0;   /* mod 2^4 */
    nprime = COMP_NPN_1;
    nprime = COMP_NPN_1;
    nprime = COMP_NPN_1;
    nprime = COMP_NPN_1;
    /* 8 multiplies of uint64_t */
    /* negate: -x mod 2^64 == (~x) + 1 */
    *((uint64_t *)(uintptr_t)np64_ret) = (~0ULL - nprime) + 1ULL;
}
824
825 /* CIOS Multiply. This is the Coarse Integrated form where the values are */
826 /* multiplied and reduced for each step of "i". This uses less memory and */
827 /* is faster as a result. Note that this is used to square as well as mul, */
828 /* so not as fast as pure squaring could be. */
/* Montgomery multiplication w_out = a * b * R^-1 mod Nmod using the CIOS
 * (Coarsely Integrated Operand Scanning) method on the accelerator, one
 * 64-bit limb of b per outer iteration.
 * If a == NULL, only the reduction passes run (reduce w_out in place).
 * Np is the precomputed -Nmod^-1 mod 2^64 (see MultprecGenNp64).
 * Ends with a conditional subtraction so the result is < Nmod. */
void MultprecCiosMul(
    unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np)
{
    int j;
    uint32_t i;
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;

    Np64 = *(uint64_t *)(uintptr_t)Np;

    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(i, w_out);
    }
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop i and then reduce after each j round */
    for (i = 0; i < N_dwordlen; i++)
    {
        /* Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[i]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /* prime for 1st; modulo a double-word */

        /* we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /* need its carry. We let the accel do this separately - really need a mode to */
        /* do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* shift the whole accumulator right by one 64-bit limb (drops w64[0]) */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /* now check if need to subtract Nmod */
    if (0U != (GET_WORD(&w_out[N_wordlen])))
    {
        j = 1; /* we have to subtract for sure if carry up */
    }
    else
    {
        j = 0;
        for (i = N_wordlen - 1U; i > 0U; i--)
        {
            if (GET_WORD(&w_out[i]) != GET_WORD(&Nmod[i]))
            {
                j = (int32_t)(GET_WORD(&w_out[i]) > GET_WORD(&Nmod[i])); /* if larger sub */
                break; /* we would remove the break if worrying about side channel */
            }
        }
    }
    if (0 == j)
    {
        return; /* Is smaller than Nmod, so done. */
    }
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();
    /* last borrow is OK since we know it could only be <2N and */
}
918
919 /* RSA_MontSignatureToPlaintextFast: */
920 /* MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
921 /* exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. */
922 /* signature = N bitpos len long "message" to process in Montgomery form - so saving conversion (divide) */
923 /* pubkey = N bitpos len long public key to process signature with */
924 /* returns: 0 */
925 /* */
926 /* Algo: compute M = signaturen^e mod public_key */
927 /* where M is original plaintext, signature is signed value */
928 /* note: e is usually either 0x3 or 0x10001 */
RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],const unsigned exp_pubkey,const unsigned pubkey[N_wordlen_max],unsigned MsgRet[WORK_BUFF_MUL4])929 int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],
930 const unsigned exp_pubkey,
931 const unsigned pubkey[N_wordlen_max],
932 unsigned MsgRet[WORK_BUFF_MUL4])
933 {
934 int bidx = 0;
935 int bitpos;
936 unsigned np64[2];
937
938 /* MsgRet working area: */
939 /* 0..N = RESULT, starting with S` */
940 /* N..N*2 = S` and then working BASE during math. */
941 /* N*2..N*4+2 = temp working area for Mont mul */
942
943 /* 1. Copy sig into MsgRet so we have one working result buffer */
944 CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result],
945 (const uint32_t *)(uintptr_t)mont_signature, N_bytelen);
946 MultprecGenNp64(pubkey, np64); /* Generate N` from LSW of N (LSW being lowest 64b word) */
947 bitpos = (int8_t)(uint8_t)(31U - __CLZ(exp_pubkey)); /* count of bits after the left most 1 */
948 while (--bitpos >= 0)
949 {
950 /* This operates on: */
951 /* result = 1; */
952 /* base = signature */
953 /* loop while exponent bits from MSb to LSb */
954 /* if (exp bit is 1) */
955 /* result = result * base */
956 /* base = base^2 */
957 /* Because the MSb of exp is always 1 by definition, we can invert this a bit: */
958 /* base = signature` */
959 /* result = base; equivalent to result = 1*base from 1st pass, but now square is needed 1st */
960 /* loop while exponent bits from MSb-1 to LSb */
961 /* base = base^2 */
962 /* if (exp bit is 1) */
963 /* result = result * base */
964 /* This ends up doing the same thing but skips two wasteful steps of multiplying by 1 and */
965 /* a final squaring never used. */
966 /* */
967 /* Next we have the problem that CIOS mul needs a separate dest buffer. So, we bounce */
968 /* base between base and temp, and likewise for result. */
969 MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase],
970 &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
971 &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], pubkey, np64);
972 if (0U != (exp_pubkey & (uint32_t)(uint8_t)(1U << (uint8_t)bitpos))) /* where e is 1 */
973 {
974 /* result has result, so we need to work into other temp area */
975 MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
976 &MsgRet[kCASPER_RamOffset_Result],
977 &MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase], pubkey, np64);
978 /* we have to copy back to result */
979
980 // CASPER_MEMCPY_I2I(&MsgRet[kCASPER_RamOffset_Result],
981 // &MsgRet[bidx ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], N_bytelen);
982 }
983 else
984 {
985 bidx = (int32_t)(uint32_t) ~(unsigned)bidx;
986 }
987 }
988
989 CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result],
990 (uint32_t *)(uintptr_t)&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
991 N_bytelen);
992
993 /* final step is one more reduction to get back to normal form (ie. divide R out) */
994 MultprecCiosMul(&MsgRet[kCASPER_RamOffset_Result], NULL, NULL, pubkey, np64);
995 return (0); /* always 0 */
996 }
997
998 /* RSA_SignatureToPlaintextFast: */
999 /* MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
1000 /* exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. */
1001 /* signature = N bitpos len long "message" to process in normal form - so converted to Mont form */
1002 /* pubkey = N bitpos len long public key to process signature with */
1003 /* returns: 0 */
1004 /* */
1005 /* Algo: compute M = signaturen^e mod public_key */
1006 /* where M is original plaintext, signature is signed value */
1007 /* note: e is usually either 0x3 or 0x10001 */
RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],const unsigned exp_pubkey,const unsigned pubkey[N_wordlen_max],unsigned MsgRet[WORK_BUFF_MUL4])1008 int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
1009 const unsigned exp_pubkey,
1010 const unsigned pubkey[N_wordlen_max],
1011 unsigned MsgRet[WORK_BUFF_MUL4])
1012 {
1013 /* MsgRet working area: */
1014 /* 0..N = RESULT, starting with S`; it is used for R` just during creation of S` */
1015 /* N..N*2 = S` and then working BASE during math. Note overflow beyond N*2 when making S` */
1016 /* N*2..N*4+2 = temp working area for Mont mul */
1017
1018 MultprecMontCalcRp(&MsgRet[kCASPER_RamOffset_Result], exp_pubkey, pubkey); /* calculate R` (=R mod N) */
1019 MultprecMontPrepareX(&MsgRet[kCASPER_RamOffset_Base], signature, &MsgRet[kCASPER_RamOffset_Result],
1020 pubkey); /* X*R1` mod N */
1021 return (RSA_MontSignatureToPlaintextFast(&MsgRet[kCASPER_RamOffset_Base], exp_pubkey, pubkey, MsgRet));
1022 }
1023
1024 /*!
1025 * brief Performs modular exponentiation - (A^E) mod N.
1026 *
1027 * This function performs modular exponentiation.
1028 *
1029 * param base CASPER base address
1030 * param signature first addend (in little endian format)
1031 * param pubN modulus (in little endian format)
1032 * param wordLen Size of pubN in bytes
1033 * param pubE exponent
1034 * param[out] plaintext Output array to store result of operation (in little endian format)
1035 */
CASPER_ModExp(CASPER_Type * base,const uint8_t * signature,const uint8_t * pubN,size_t wordLen,uint32_t pubE,uint8_t * plaintext)1036 void CASPER_ModExp(
1037 CASPER_Type *base, const uint8_t *signature, const uint8_t *pubN, size_t wordLen, uint32_t pubE, uint8_t *plaintext)
1038 {
1039 #define PK_LOC &msg_ret[kCASPER_RamOffset_Modulus]
1040 #define SIG_LOC &msg_ret[(unsigned)kCASPER_RamOffset_Modulus + N_wordlen_max]
1041
1042 N_wordlen = wordLen; /* set global variable for key length - used by RSA_SignatureToPlaintextFast() */
1043 CASPER_MEMCPY_N2I(PK_LOC, (const uint32_t *)(uintptr_t)pubN, N_bytelen);
1044 CASPER_MEMCPY_N2I(SIG_LOC, (const uint32_t *)(uintptr_t)signature, N_bytelen);
1045 (void)RSA_SignatureToPlaintextFast((const unsigned *)(uintptr_t)(SIG_LOC), pubE,
1046 (const unsigned *)(uintptr_t)(PK_LOC), (unsigned int *)(uintptr_t)msg_ret);
1047
1048 CASPER_MEMCPY_I2N((uint32_t *)(uintptr_t)plaintext, msg_ret, N_bytelen);
1049 }
1050
1051 /*!
1052 * brief Enables clock and disables reset for CASPER peripheral.
1053 *
1054 * Enable clock and disable reset for CASPER.
1055 *
1056 * param base CASPER base address
1057 */
CASPER_Init(CASPER_Type * base)1058 void CASPER_Init(CASPER_Type *base)
1059 {
1060 #if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
1061 #if defined(CASPER_CLOCKS)
1062 CLOCK_EnableClock(kCLOCK_Casper);
1063 #endif
1064 #endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
1065 #if defined(CASPER_RSTS)
1066 RESET_PeripheralReset(kCASPER_RST_SHIFT_RSTn);
1067 #endif
1068 #if defined(FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE) && (FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE > 0)
1069 /* Enable hardware interleaving to RAMX0 and RAMX1 for CASPER */
1070 SYSCON->CASPER_CTRL = SYSCON_CASPER_CTRL_INTERLEAVE(1);
1071 #endif /* FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE */
1072 /* If Casper init is called with secure address, use secure addres also for accessing Casper RAM. */
1073 s_casperRamBase = (unsigned)CASPER_RAM_BASE_NS | ((uint32_t)base & 0x10000000u);
1074 msg_ret = (uint32_t *)s_casperRamBase;
1075 }
1076
1077 /*!
1078 * brief Disables clock for CASPER peripheral.
1079 *
1080 * Disable clock and enable reset.
1081 *
1082 * param base CASPER base address
1083 */
CASPER_Deinit(CASPER_Type * base)1084 void CASPER_Deinit(CASPER_Type *base)
1085 {
1086 #if defined(CASPER_RSTS)
1087 RESET_SetPeripheralReset(kCASPER_RST_SHIFT_RSTn);
1088 #endif
1089 #if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
1090 #if defined(CASPER_CLOCKS)
1091 CLOCK_DisableClock(kCLOCK_Casper);
1092 #endif
1093 #endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
1094 }
1095
1096 /* New ECC code which uses Casper. */
1097
1098 /* Set the prime modulus mod in Casper memory.
1099 */
CASPER_ecc_init(casper_algo_t curve)1100 void CASPER_ecc_init(casper_algo_t curve)
1101 {
1102 uint32_t *mod;
1103
1104 if (curve == kCASPER_ECC_P256)
1105 {
1106 N_wordlen = 256U / 32U;
1107 mod = NISTp256;
1108 }
1109
1110 if (curve == kCASPER_ECC_P384)
1111 {
1112 N_wordlen = 384U / 32U;
1113 mod = NISTp384;
1114 }
1115
1116 if (curve == kCASPER_ECC_P521)
1117 {
1118 N_wordlen = 576U / 32U;
1119 mod = NISTp521;
1120 }
1121
1122 CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U)], mod, N_wordlen * sizeof(uint32_t));
1123 uint8_t a[((CASPER_MAX_ECC_SIZE_WORDLEN + 4U) - CASPER_MAX_ECC_SIZE_WORDLEN) * sizeof(uint32_t)] = {0};
1124 CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U) + N_wordlen], a, ((N_wordlen + 4U) - N_wordlen) * sizeof(uint32_t));
1125 }
1126
/* Constant-time equality check of two N_wordlen-word operands held in
 * CASPER memory. *res is 0 when equal, non-zero otherwise; every word
 * pair is always examined so timing does not depend on the data. */
void CASPER_ECC_equal(int *res, uint32_t *op1, uint32_t *op2)
{
    uint32_t lhs[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
    uint32_t rhs[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
    uint32_t diff = 0U;
    uint32_t idx;

    /* Pull both operands out of CASPER RAM into local buffers. */
    CASPER_MEMCPY(lhs, op1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(rhs, op2, N_wordlen * sizeof(uint32_t));

    /* Accumulate the XOR of every word pair; any mismatch leaves a bit set. */
    for (idx = 0U; idx < N_wordlen; idx++)
    {
        diff |= (lhs[idx] ^ rhs[idx]);
    }

    *res = (int32_t)diff;
}
1147
/* Constant-time zero check of an N_wordlen-word operand held in CASPER
 * memory. *res is 0 when the operand is all-zero, non-zero otherwise. */
void CASPER_ECC_equal_to_zero(int *res, uint32_t *op1)
{
    uint32_t buf[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
    uint32_t acc = 0U;
    uint32_t idx;

    /* Pull the operand out of CASPER RAM into a local buffer. */
    CASPER_MEMCPY(buf, op1, N_wordlen * sizeof(uint32_t));

    /* OR all words together; any non-zero word leaves a bit set. */
    for (idx = 0U; idx < N_wordlen; idx++)
    {
        acc |= buf[idx];
    }

    *res = (int32_t)acc;
}
1166
/* Scalar multiplication on P-256: (resX, resY) = scalar * (X, Y).
 * Inputs and outputs are little-endian normal-form coordinates; the
 * computation itself runs in the Montgomery domain inside CASPER RAM.
 * CASPER_MEM is laid out in (wordlen + 4)-word slots; slot k of the ECC
 * working area starts at offset 2*(wordlen+4) + k*(wordlen+4). */
void CASPER_ECC_SECP256R1_Mul(
    CASPER_Type *base, uint32_t resX[8], uint32_t resY[8], uint32_t X[8], uint32_t Y[8], uint32_t scalar[8])
{
    uint32_t X1[8] = {0};
    uint32_t Y1[8] = {0};
    /* Convert the affine input point to the Montgomery domain. */
    toMontgomery_ECC_P256(X1, X);
    toMontgomery_ECC_P256(Y1, Y);

    /* Load the Montgomery-form point into slots 0 and 1. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    /* Jacobian scalar multiply: result (X:Y:Z) goes to slots 6, 7, 8. */
    Jac_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar, NISTp256, NISTp256_q);

    /* Convert Jacobian (slots 6,7,8) back to affine (slots 3,4). */
    Jac_toAffine(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0};
    one[0]                                        = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* copy out to result */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
}
1223
/* Double scalar multiplication on P-256:
 * (resX, resY) = scalar1 * (X1, Y1) + scalar2 * (X2, Y2).
 * Inputs and outputs are little-endian normal-form coordinates; the
 * computation runs in the Montgomery domain inside CASPER RAM, using the
 * same (wordlen + 4)-word slot layout as CASPER_ECC_SECP256R1_Mul. */
void CASPER_ECC_SECP256R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[8],
                                 uint32_t resY[8],
                                 uint32_t X1[8],
                                 uint32_t Y1[8],
                                 uint32_t scalar1[8],
                                 uint32_t X2[8],
                                 uint32_t Y2[8],
                                 uint32_t scalar2[8])
{
    uint32_t zeroes[(kCASPER_ECC_P256_wordlen + 4U)] = {0};

    /* Load both input points into slots 0..3. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to the Montgomery domain in place. */
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Clear slots 4..6 which will receive the Jacobian result (X:Y:Z). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    /* scalar1*(slot0,slot1) + scalar2*(slot2,slot3) -> Jacobian in slots 4,5,6. */
    double_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar2);

    /* Back to affine; note the destination uses a higher working area
       (offset 20*wordlen + 80, i.e. slot index 20 of the slot layout). */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Montgomery to normal: multiply each coordinate by the constant 1. */
    uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0};
    one[0]                                        = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Copy the normal-form result (slots 1 and 2) back to the caller. */
    CASPER_MEMCPY(resX,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                               1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                               2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
}
1313
/* Scalar multiplication on P-384: (resX, resY) = scalar * (X, Y).
 * Same structure as CASPER_ECC_SECP256R1_Mul, using 12-word (384-bit)
 * coordinates and (wordlen + 4)-word slots in CASPER RAM. */
void CASPER_ECC_SECP384R1_Mul(
    CASPER_Type *base, uint32_t resX[12], uint32_t resY[12], uint32_t X[12], uint32_t Y[12], uint32_t scalar[12])
{
    uint32_t X1[12] = {0};
    uint32_t Y1[12] = {0};
    /* Convert the affine input point to the Montgomery domain. */
    toMontgomery_ECC_P384(X1, X);
    toMontgomery_ECC_P384(Y1, Y);

    /* Load the Montgomery-form point into slots 0 and 1. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    /* Jacobian scalar multiply: result (X:Y:Z) goes to slots 6, 7, 8. */
    Jac_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        scalar, NISTp384, NISTp384_q);

    /* Convert Jacobian (slots 6,7,8) back to affine (slots 3,4). */
    Jac_toAffine(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    /* NOTE(review): only wordlen words of `one` are copied here, while the
       P-256 variant copies wordlen + 4 words (zeroing the guard words too) -
       confirm the guard words of slot 0 are already zero at this point. */
    uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* copy out to result */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
}
1369
/* Double scalar multiplication on P-384:
 * (resX, resY) = scalar1 * (X1, Y1) + scalar2 * (X2, Y2).
 * Same structure as CASPER_ECC_SECP256R1_MulAdd, using 12-word (384-bit)
 * coordinates and (wordlen + 4)-word slots in CASPER RAM.
 * NOTE(review): unlike the P-256 variant, the Jacobian result slots 4..6
 * are not explicitly zeroed before double_scalar_multiplication - confirm
 * that routine fully initializes its outputs. */
void CASPER_ECC_SECP384R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[12],
                                 uint32_t resY[12],
                                 uint32_t X1[12],
                                 uint32_t Y1[12],
                                 uint32_t scalar1[12],
                                 uint32_t X2[12],
                                 uint32_t Y2[12],
                                 uint32_t scalar2[12])
{
    /* Load both input points into slots 0..3. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to the Montgomery domain in place. */
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* scalar1*(slot0,slot1) + scalar2*(slot2,slot3) -> Jacobian in slots 4,5,6. */
    double_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        scalar2);

    /* Back to affine; destination uses the higher working area at
       offset 20*wordlen + 80. */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* Montgomery to normal: multiply each coordinate by the constant 1. */
    uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* Copy the normal-form result (slots 1 and 2) back to the caller. */
    CASPER_MEMCPY(resX,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
                               1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
                               2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
}
1447
/*!
 * brief ECC secp521r1 single scalar multiplication: (resX, resY) = scalar * (X, Y).
 *
 * param base   CASPER peripheral base address.
 * param resX   Output affine x-coordinate (18 words).
 * param resY   Output affine y-coordinate (18 words).
 * param X      Input affine x-coordinate (18 words).
 * param Y      Input affine y-coordinate (18 words).
 * param scalar Scalar multiplier (18 words).
 *
 * CASPER work RAM is addressed in slots of (wordlen + 4) words starting at
 * BASE = ((wordlen + 4) + 1*(wordlen + 4)) = 2*(wordlen + 4); "slot k" below
 * means CASPER_MEM[BASE + k*(wordlen + 4)].
 */
void CASPER_ECC_SECP521R1_Mul(
    CASPER_Type *base, uint32_t resX[18], uint32_t resY[18], uint32_t X[18], uint32_t Y[18], uint32_t scalar[18])
{
    uint32_t X1[18] = {0};
    uint32_t Y1[18] = {0};
    /* Convert the input point to the Montgomery domain. */
    toMontgomery_ECC_P521(X1, X);
    toMontgomery_ECC_P521(Y1, Y);

    /* Load the Montgomery-domain point into slot 0 (x) and slot 1 (y). */
    CASPER_MEMCPY(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* Jacobian scalar multiplication: result (X:Y:Z) goes into slots 6, 7, 8. */
    Jac_scalar_multiplication(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    7U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    8U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        scalar, NISTp521, NISTp521_q);

    /* Convert the Jacobian result to affine coordinates in slots 3 (x) and 4 (y). */
    Jac_toAffine(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    7U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    8U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery
     * (Montgomery multiplication by plain 1 strips the R factor). */
    uint32_t one[18] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    CASPER_MEMCPY(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        one, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* copy out to result: x from slot 5, y from slot 6 */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}
1524
/*!
 * brief ECC secp521r1 double scalar multiplication:
 * (resX, resY) = scalar1 * (X1, Y1) + scalar2 * (X2, Y2).
 *
 * param base            CASPER peripheral base address.
 * param resX, resY      Output affine coordinates (18 words each).
 * param X1, Y1, scalar1 First point and its scalar (18 words each).
 * param X2, Y2, scalar2 Second point and its scalar (18 words each).
 *
 * CASPER work RAM is addressed in slots of (wordlen + 4) words. Inputs and
 * intermediates use BASE = 2*wordlen + 8 = 2*(wordlen + 4); the affine result
 * of Jac_toAffine lands at 20*wordlen + 80 = 20*(wordlen + 4).
 *
 * Fix: removed the redundant double cast `(uint32_t)(uint32_t)` in the second
 * Montgomery-to-normal multiply so the offset expression matches every sibling.
 */
void CASPER_ECC_SECP521R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[18],
                                 uint32_t resY[18],
                                 uint32_t X1[18],
                                 uint32_t Y1[18],
                                 uint32_t scalar1[18],
                                 uint32_t X2[18],
                                 uint32_t Y2[18],
                                 uint32_t scalar2[18])
{
    uint32_t zeroes[(kCASPER_ECC_P521_wordlen + 4U)] = {0};

    /* Load point 1 into slots 0/1 and point 2 into slots 2/3. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* Convert both points to the Montgomery domain in place. */
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Clear slots 4..6, which receive the Jacobian result (X:Y:Z). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    double_scalar_multiplication(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        scalar2);

    /* Convert to affine; result lands at the 20*(wordlen+4) area, slots 0/1. */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Montgomery to normal: multiply each coordinate by plain 1. */
    uint32_t one[(kCASPER_ECC_P521_wordlen + 4U)] = {0x0};
    one[0] = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Copy out: x from slot 1, y from slot 2. */
    CASPER_MEMCPY(
        resX,
        (&CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]),
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        (&CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]),
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}
1615
1616 // CIOS Multiply. This is the Coarse Integrated form where the values are
1617 // multiplied and reduced for each step of "i". This uses less memory and
1618 // is faster as a result. Note that this is used to square as well as mul,
1619 // so not as fast as pure squaring could be.
/* Constant-time CIOS Montgomery multiplication: w_out = a * b * R^-1 mod Nmod.
 * If a == NULL, only the Montgomery reduction half is applied to w_out.
 * Np points to the 64-bit Montgomery constant (used as N' in the m = w[0]*N'
 * step). All big-number buffers live in CASPER RAM and are accessed as 64-bit
 * lanes; the uint32_t* -> uint64_t* casts rely on the -fno-strict-aliasing
 * behavior enforced by the pragmas at the top of this file.
 */
static void MultprecCiosMul_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;
    uint32_t *T1 = &CASPER_MEM[0], borrow; /* T1: scratch copy used for the final conditional subtract */

    Np64 = *(uint64_t *)(uintptr_t)Np;

    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(j, w_out);
    }
    /* Clear the two extra carry double-words above the N-sized result. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /* Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /* prime for 1st; modulo a double-word */

        /* we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /* need its carry. We let the accel do this separately - really need a mode to */
        /* do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* Shift w right by one double-word (drop the now-zero low limb). */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /* now check if need to subtract Nmod */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    // if w_out > T1 then there was a borrow
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));

    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    /* Constant-time select: keep w - N unless the subtraction borrowed. */
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int16_t)(uint16_t)N_wordlen);
}
1698
1699 /* Compute C = A - B % mod
1700 * Assumes all operand have two extra limbs to store carry.
1701 */
/* Modular subtraction C = (A - B) mod `mod`, using CASPER_MEM[0] as scratch.
 * All operands live in CASPER RAM with two extra limbs for carry/borrow.
 * Strategy: tmp = A - B; C = tmp + mod; pick tmp when A - B did not borrow,
 * otherwise the re-added value C. */
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));

    /* Compute tmp = A - B. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(tmp)));
    Accel_done();

    /* Top limb grew => the subtraction wrapped (borrow out). */
    borrow = (int32_t)((GET_WORD(&((uint32_t *)(uintptr_t)tmp)[N_wordlen - 1U])) > GET_WORD(&A[N_wordlen - 1U]));
    CASPER_MEMCPY(c64, tmp, N_wordlen * sizeof(uint32_t));

    /* Compute C = Mod + tmp */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(c64)));
    Accel_done();

    /* Constant-time select between tmp (no borrow) and tmp + mod (borrow). */
    casper_select(C, (uint32_t *)(uintptr_t)tmp, C, borrow, (int16_t)(uint16_t)N_wordlen);
}
1731
1732 /* Compute C = A + B % mod
1733 * Assumes all operand have two extra limbs to store carry.
1734 */
/* Modular addition C = (A + B) mod `mod`, using CASPER_MEM[0] as scratch.
 * All operands live in CASPER RAM with two extra limbs for carry.
 * Strategy: tmp = A + B (with one extra double-limb); C = tmp - mod; pick C
 * unless the subtraction borrowed, in which case tmp < mod and tmp is kept. */
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));
    /* Zero the carry double-limbs of all three operands before the wide add. */
    SET_DWORD(&tmp[N_wordlen / 2U], 0ULL);
    SET_DWORD(&b64[N_wordlen / 2U], 0ULL);
    SET_DWORD(&m64[N_wordlen / 2U], 0ULL);

    /* Compute tmp = A + B using one additonal double-length limb. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(tmp)));
    Accel_done();

    CASPER_MEMCPY(c64, tmp, (N_wordlen + 2U) * sizeof(uint32_t));

    /* Compute C = Mod - tmp */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(c64)));
    Accel_done();

    // borrow = g_carry;
    /* Top limb grew => subtraction borrowed, i.e. tmp < mod: keep tmp. */
    borrow = (int32_t)(GET_WORD(&C[N_wordlen]) > GET_WORD(&(((uint32_t *)(uintptr_t)tmp)[N_wordlen])));
    casper_select(C, C, (uint32_t *)(uintptr_t)tmp, borrow, (int16_t)(uint16_t)N_wordlen);
}
1768
1769 /* Compute c = a/2 mod p where b is scratch space. */
/* Compute c = a/2 mod p where b is scratch space.
 * If a is even the result is simply a >> 1 (computed into b);
 * if a is odd, (a + p) is even and (a + p)/2 is the modular half.
 * The modulus p is read from CASPER_MEM[N_wordlen + 4]. A constant-time
 * select on the LSB of a picks between the two candidates. */
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b)
{
    shiftright(b, a, 1U); /* Compute a/2 and (a+p)/2 */

    /* Compute tmp = a + p using one additonal double-length limb. */
    CASPER_MEMCPY(c, a, N_wordlen * sizeof(uint32_t));
    SET_WORD(&c[N_wordlen], 0);
    SET_WORD(&c[N_wordlen + 1U], 0U);

    Accel_SetABCD_Addr(CA_MK_OFF(((uint64_t *)(uintptr_t)&CASPER_MEM[(N_wordlen + 4U)])), 0);
    Accel_crypto_mul(
        Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(((uint64_t *)(uintptr_t)c))));
    Accel_done();

    /* Halve (a + p), folding the carry word's low bit back into the top limb. */
    shiftright(c, c, 1U);
    SET_WORD(&c[N_wordlen - 1U], GET_WORD(&c[N_wordlen - 1U]) | (GET_WORD(&c[N_wordlen]) << 31));
    SET_WORD(&c[N_wordlen], 0U);
    /* a odd -> take (a+p)/2 in c; a even -> take a/2 in b. */
    casper_select(c, b, c, (int32_t)(uint32_t)(GET_WORD(&a[0]) & 1U), (int16_t)(uint16_t)(N_wordlen));
}
1789
/* Read a single 32-bit word through the CASPER memory access macro. */
static uint32_t casper_get_word(uint32_t *addr)
{
    uint32_t value = GET_WORD(addr);
    return value;
}
1794
1795 /* Shift right by 1 <= c <= 31. z[] and x[] in system RAM, no interleaving macros used. */
shiftrightSysram(uint32_t * z,uint32_t * x,uint32_t c)1796 static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c)
1797 {
1798 z[0] = (x[1] << (32U - (c))) | (x[0] >> (c));
1799 z[1] = (x[2] << (32U - (c))) | (x[1] >> (c));
1800 z[2] = (x[3] << (32U - (c))) | (x[2] >> (c));
1801 z[3] = (x[4] << (32U - (c))) | (x[3] >> (c));
1802 z[4] = (x[5] << (32U - (c))) | (x[4] >> (c));
1803 z[5] = (x[6] << (32U - (c))) | (x[5] >> (c));
1804 z[6] = (x[7] << (32U - (c))) | (x[6] >> (c));
1805
1806 if (N_wordlen == 18U)
1807 {
1808 z[7] = (x[8] << (32U - (c))) | (x[7] >> (c));
1809 z[8] = (x[9] << (32U - (c))) | (x[8] >> (c));
1810 z[9] = (x[10] << (32U - (c))) | (x[9] >> (c));
1811 z[10] = (x[11] << (32U - (c))) | (x[10] >> (c));
1812 z[11] = (x[12] << (32U - (c))) | (x[11] >> (c));
1813 z[12] = (x[13] << (32U - (c))) | (x[12] >> (c));
1814 z[13] = (x[14] << (32U - (c))) | (x[13] >> (c));
1815 z[14] = (x[15] << (32U - (c))) | (x[14] >> (c));
1816 z[15] = (x[16] << (32U - (c))) | (x[15] >> (c));
1817 z[16] = (x[17] << (32U - (c))) | (x[16] >> (c));
1818 z[17] = (x[17] >> (c));
1819 }
1820
1821 if (N_wordlen == 12U)
1822 {
1823 z[7] = (x[8] << (32U - (c))) | (x[7] >> (c));
1824 z[8] = (x[9] << (32U - (c))) | (x[8] >> (c));
1825 z[9] = (x[10] << (32U - (c))) | (x[9] >> (c));
1826 z[10] = (x[11] << (32U - (c))) | (x[10] >> (c));
1827 z[11] = (x[11] >> (c));
1828 }
1829 if (N_wordlen == 8U)
1830 {
1831 z[7] = (x[7] >> (c));
1832 }
1833 }
1834 /* Shift right by 1 <= c <= 31. */
shiftright(uint32_t * z,uint32_t * x,uint32_t c)1835 static void shiftright(uint32_t *z, uint32_t *x, uint32_t c)
1836 {
1837 SET_WORD(&z[0], (GET_WORD(&x[1]) << (32U - (c))) | (GET_WORD(&x[0]) >> (c)));
1838 SET_WORD(&z[1], (GET_WORD(&x[2]) << (32U - (c))) | (GET_WORD(&x[1]) >> (c)));
1839 SET_WORD(&z[2], (GET_WORD(&x[3]) << (32U - (c))) | (GET_WORD(&x[2]) >> (c)));
1840 SET_WORD(&z[3], (GET_WORD(&x[4]) << (32U - (c))) | (GET_WORD(&x[3]) >> (c)));
1841 SET_WORD(&z[4], (GET_WORD(&x[5]) << (32U - (c))) | (GET_WORD(&x[4]) >> (c)));
1842 SET_WORD(&z[5], (GET_WORD(&x[6]) << (32U - (c))) | (GET_WORD(&x[5]) >> (c)));
1843 SET_WORD(&z[6], (GET_WORD(&x[7]) << (32U - (c))) | (GET_WORD(&x[6]) >> (c)));
1844
1845 if (N_wordlen == 18U)
1846 {
1847 SET_WORD(&z[7], (GET_WORD(&x[8]) << (32U - (c))) | (GET_WORD(&x[7]) >> (c)));
1848 SET_WORD(&z[8], (GET_WORD(&x[9]) << (32U - (c))) | (GET_WORD(&x[8]) >> (c)));
1849 SET_WORD(&z[9], (GET_WORD(&x[10]) << (32U - (c))) | (GET_WORD(&x[9]) >> (c)));
1850 SET_WORD(&z[10], (GET_WORD(&x[11]) << (32U - (c))) | (GET_WORD(&x[10]) >> (c)));
1851 SET_WORD(&z[11], (GET_WORD(&x[12]) << (32U - (c))) | (GET_WORD(&x[11]) >> (c)));
1852 SET_WORD(&z[12], (GET_WORD(&x[13]) << (32U - (c))) | (GET_WORD(&x[12]) >> (c)));
1853 SET_WORD(&z[13], (GET_WORD(&x[14]) << (32U - (c))) | (GET_WORD(&x[13]) >> (c)));
1854 SET_WORD(&z[14], (GET_WORD(&x[15]) << (32U - (c))) | (GET_WORD(&x[14]) >> (c)));
1855 SET_WORD(&z[15], (GET_WORD(&x[16]) << (32U - (c))) | (GET_WORD(&x[15]) >> (c)));
1856 SET_WORD(&z[16], (GET_WORD(&x[17]) << (32U - (c))) | (GET_WORD(&x[16]) >> (c)));
1857 SET_WORD(&z[17], (GET_WORD(&x[17]) >> (c)));
1858 }
1859 if (N_wordlen == 12U)
1860 {
1861 SET_WORD(&z[7], (GET_WORD(&x[8]) << (32U - (c))) | (GET_WORD(&x[7]) >> (c)));
1862 SET_WORD(&z[8], (GET_WORD(&x[9]) << (32U - (c))) | (GET_WORD(&x[8]) >> (c)));
1863 SET_WORD(&z[9], (GET_WORD(&x[10]) << (32U - (c))) | (GET_WORD(&x[9]) >> (c)));
1864 SET_WORD(&z[10], (GET_WORD(&x[11]) << (32U - (c))) | (GET_WORD(&x[10]) >> (c)));
1865 SET_WORD(&z[11], (GET_WORD(&x[11]) >> (c)));
1866 }
1867 if (N_wordlen == 8U)
1868 {
1869 SET_WORD((&z[7]), (GET_WORD(&x[7]) >> (c)));
1870 }
1871 }
1872 /* Shift left by 1 <= c <= 31. */
/* NOTE(review): the incoming low bits are read from z (not x), so this routine
 * appears to assume in-place use (z == x); confirm against callers before
 * using it with distinct buffers. Words are written top-down so the not-yet-
 * overwritten lower words supply the carried-in bits. */
static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c)
{
    if (N_wordlen == 18U)
    {
        SET_WORD(&z[17], (GET_WORD(&x[17]) << (c)) | GET_WORD(&z[16]) >> (32U - (c)));
        SET_WORD(&z[16], (GET_WORD(&x[16]) << (c)) | GET_WORD(&z[15]) >> (32U - (c)));
        SET_WORD(&z[15], (GET_WORD(&x[15]) << (c)) | GET_WORD(&z[14]) >> (32U - (c)));
        SET_WORD(&z[14], (GET_WORD(&x[14]) << (c)) | GET_WORD(&z[13]) >> (32U - (c)));
        SET_WORD(&z[13], (GET_WORD(&x[13]) << (c)) | GET_WORD(&z[12]) >> (32U - (c)));
        SET_WORD(&z[12], (GET_WORD(&x[12]) << (c)) | GET_WORD(&z[11]) >> (32U - (c)));
        SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c)));
        SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c)));
        SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c)));
        SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c)));
    }
    if (N_wordlen == 12U)
    {
        SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c)));
        SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c)));
        SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c)));
        SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c)));
    }
    /* Common low 8 words for all supported sizes; z[0] shifts in zeroes. */
    SET_WORD(&z[7], (GET_WORD(&x[7]) << (c)) | GET_WORD(&z[6]) >> (32U - (c)));
    SET_WORD(&z[6], (GET_WORD(&x[6]) << (c)) | GET_WORD(&z[5]) >> (32U - (c)));
    SET_WORD(&z[5], (GET_WORD(&x[5]) << (c)) | GET_WORD(&z[4]) >> (32U - (c)));
    SET_WORD(&z[4], (GET_WORD(&x[4]) << (c)) | GET_WORD(&z[3]) >> (32U - (c)));
    SET_WORD(&z[3], (GET_WORD(&x[3]) << (c)) | GET_WORD(&z[2]) >> (32U - (c)));
    SET_WORD(&z[2], (GET_WORD(&x[2]) << (c)) | GET_WORD(&z[1]) >> (32U - (c)));
    SET_WORD(&z[1], (GET_WORD(&x[1]) << (c)) | GET_WORD(&z[0]) >> (32U - (c)));
    SET_WORD(&z[0], (GET_WORD(&x[0]) << (c)));
}
1904
multiply_casper(uint32_t w_out[],const uint32_t a[],const uint32_t b[])1905 static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[])
1906 {
1907 uint32_t *Np;
1908
1909 if (N_wordlen == 8U)
1910 {
1911 Np = Np256;
1912 MultprecCiosMul_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np);
1913 }
1914 if (N_wordlen == 12U)
1915 {
1916 Np = Np384;
1917 MultprecCiosMul_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np);
1918 }
1919
1920 if (N_wordlen == 18U)
1921 {
1922 Np = Np521;
1923 MultprecCiosMul521_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np);
1924 }
1925 }
1926 /* Convert a projective point (X1 : Y1 : Z1)
1927 * to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3)
1928 * The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap
1929 */
void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    uint32_t *T1, *T2;

    /* Two scratch slots above the 2 + 9 slots reserved by the callers. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];

    square_casper(T1, Z1);       // Z^2
    multiply_casper(T2, T1, Z1); // Z^3

    // Montgomery inverse: T1 = (Z^3)^-1, dispatched on the active curve
    if (N_wordlen == 8U)
    {
        invert_mod_p256(T1, T2);
    }

    if (N_wordlen == 12U)
    {
        invert_mod_p384(T1, T2);
    }

    if (N_wordlen == 18U)
    {
        invert_mod_p521(T1, T2);
    }

    multiply_casper(Y3, Y1, T1); // Y3 = Y/Z^3
    multiply_casper(T2, T1, Z1); // Z^-2 (= Z^-3 * Z)
    multiply_casper(X3, X1, T2); // X3 = X/Z^2
}
1960
1961 /* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
1962 * where (X1: Y1: Z1) != (X2 : Y2 : Z2)
1963 * (X3 : Y3: Z3) may be the same as one of the inputs.
1964 */
void Jac_addition(uint32_t *X3,
                  uint32_t *Y3,
                  uint32_t *Z3,
                  uint32_t *X1,
                  uint32_t *Y1,
                  uint32_t *Z1,
                  uint32_t *X2,
                  uint32_t *Y2,
                  uint32_t *Z2)
{
    uint32_t *Z1Z1, *Z2Z2, *U1, *S1, *J, *H, *V, *t0, *t1;
    int m1, m2;

    /* Nine scratch slots above the 2 + 9 slots reserved by the callers. */
    Z1Z1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    Z2Z2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    U1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    S1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    J    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];
    H    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
    V    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
    t0   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
    t1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];

    /* Z == 0 marks the point at infinity: adding it returns the other operand. */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    CASPER_ECC_equal_to_zero(&m2, Z2);
    if (m1 == 0)
    {
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z2, N_wordlen * 4U);
        return;
    }
    if (m2 == 0)
    {
        CASPER_MEMCPY(X3, X1, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y1, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z1, N_wordlen * 4U);
        return;
    }

    /* Bring both points to a common denominator to compare coordinates. */
    square_casper(Z1Z1, Z1);
    square_casper(Z2Z2, Z2);
    multiply_casper(U1, X1, Z2Z2);
    multiply_casper(H, X2, Z1Z1); /* if H equals U1 then X's are the same */
    multiply_casper(t0, Z2, Z2Z2);
    multiply_casper(S1, Y1, t0);
    multiply_casper(t0, Z1, Z1Z1);
    multiply_casper(J, Y2, t0); /* if (S1 == J) then Y's are the same */

    CASPER_ECC_equal(&m1, H, U1); /* If H and U1 match then the X-coordinates are the same. */
    CASPER_ECC_equal(&m2, S1, J); /* If S1 and J match then the Y-coordinates are the same. */
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Same point: addition formula degenerates, use doubling. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
             We work with the point at infinity.
             The Z-coordinate will be set to zero in this function.
           } */
    }

    /* General Jacobian addition on the scratch variables computed above. */
    sub_casper(H, H, U1);
    mul2_casper(t0, H);
    square_casper(t1, t0);
    sub_casper(t0, J, S1);
    multiply_casper(J, H, t1);
    multiply_casper(V, U1, t1);
    mul2_casper(U1, t0);
    square_casper(t0, U1);
    mul2_casper(t1, V);
    sub_casper(t0, t0, J);
    sub_casper(X3, t0, t1);
    sub_casper(t0, V, X3);
    multiply_casper(t1, S1, J);
    mul2_casper(t1, t1);
    multiply_casper(V, U1, t0);
    sub_casper(Y3, V, t1);
    add_casper(V, Z1, Z2);
    square_casper(t1, V);
    sub_casper(t1, t1, Z1Z1);
    sub_casper(t1, t1, Z2Z2);
    multiply_casper(Z3, t1, H);
}
2051
2052 /* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2)
2053 * where (X1: Y1: Z1) != (X2, Y2)
2054 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
2055 * Source: 2004 Hankerson?Menezes?Vanstone, page 91.
2056 */
void Jac_add_affine(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;
    uint32_t *ONE = NULL;
    int m1, m2;

    /* Per-curve constant used as the Z-coordinate of an affine point
     * (presumably 1 in Montgomery form -- see the NISTr* definitions). */
    if (N_wordlen == 8U)
    {
        ONE = NISTr256;
    }
    if (N_wordlen == 12U)
    {
        ONE = NISTr384;
    }
    if (N_wordlen == 18U)
    {
        ONE = NISTr521;
    }

    /* Five scratch slots above the 2 + 9 slots reserved by the callers. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    /* Z1 == 0: first operand is the point at infinity, return (X2, Y2, 1). */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    if (m1 == 0)
    {
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, ONE, N_wordlen * 4U);
        return;
    }

    /* Lift the affine point to the Jacobian frame of (X1 : Y1 : Z1). */
    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    square_casper(T3, Z1);
    multiply_casper(T2, T3, Z1);
    multiply_casper(T4, T3, X2);
    multiply_casper(T3, T2, Y2);

    CASPER_ECC_equal(&m1, T4, X1);
    CASPER_ECC_equal(&m2, T3, Y1);
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Same point: addition degenerates, use doubling. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
             We work with the point at infinity.
             The Z-coordinate will be set to zero in this function.
           } */
    }

    /* Mixed Jacobian-affine addition (Hankerson-Menezes-Vanstone, p. 91). */
    sub_casper(T1, T4, X1);
    sub_casper(T2, T3, Y1);
    multiply_casper(Z3, T5, T1);
    square_casper(T3, T1);
    multiply_casper(T4, T3, T1);
    multiply_casper(T5, T3, X1);
    mul2_casper(T1, T5);
    square_casper(X3, T2);
    sub_casper(X3, X3, T1);
    sub_casper(X3, X3, T4);
    sub_casper(T3, T5, X3);
    multiply_casper(T1, T3, T2);
    multiply_casper(T2, T4, Y1);
    sub_casper(Y3, T1, T2);
}
2128
2129 static uint32_t casper_get_word(uint32_t *addr);
2130
2131 /* Point doubling from: 2004 Hankerson?Menezes?Vanstone, page 91.
2132 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
2133 * (X3 : Y3: Z3) may be the same as the input.
2134 */
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;

    /* Five scratch slots above the 2 + 9 slots reserved by the callers. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    /* (X1 - Z1^2)(X1 + Z1^2) = X1^2 - Z1^4; exploits a = -3 for NIST curves. */
    square_casper(T1, Z1);
    sub_casper(T3, X1, T1);
    add_casper(T1, X1, T1);
    multiply_casper(T4, T3, T1);

    mul2_casper(T3, T4);

    add_casper(T2, T3, T4); /* T2 = 3 * (X1^2 - Z1^4) */

    mul2_casper(Y3, Y1); /* Y3 = 2*Y1 (temporary) */

    /* Z3 = 2 * Y1 * Z1 (T5 holds a copy of Z1 to avoid aliasing with Z3). */
    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    multiply_casper(Z3, Y3, T5);

    square_casper(T5, Y3); /* T5 = 4*Y1^2 */

    multiply_casper(T3, T5, X1); /* T3 = 4*X1*Y1^2 */

    square_casper(Y3, T5); /* Y3 = 16*Y1^4 */

    half(T5, Y3, T4); /* T5 = 8*Y1^4 (modular halving) */

    square_casper(X3, T2);

    mul2_casper(T1, T3);

    sub_casper(X3, X3, T1); /* X3 = T2^2 - 8*X1*Y1^2 */

    sub_casper(T1, T3, X3);

    multiply_casper(T3, T1, T2);

    sub_casper(Y3, T3, T5); /* Y3 = T2*(4*X1*Y1^2 - X3) - 8*Y1^4 */
}
2179
2180 /* Recoding for a signed fixed window.
2181 * Source: https://eprint.iacr.org/2014/130.pdf, Algorithm 6
 * Recode the n-bit integer k into ceil(n/(w-1)) digits
2183 * where each digit is in
2184 * { +/- 1, +/- 3, ..., +/- 2^(w-1)-1 }
2185 * and put the result in c.
2186 */
recode(int8_t * c,uint32_t * k,int n,int w)2187 static void recode(int8_t *c, uint32_t *k, int n, int w)
2188 {
2189 int i, t;
2190 uint32_t K[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
2191 (void)memcpy(K, k, (size_t)ceil(((double)n / 8.)));
2192 t = (n + (w - 2)) / (w - 1);
2193 for (i = 0; i < t; i++)
2194 {
2195 c[i] = (int8_t)(uint8_t)((K[0] & ((uint32_t)(uint32_t)(1UL << (uint32_t)w) - 1UL)) -
2196 (uint32_t)(uint32_t)(1UL << ((uint32_t)w - 1UL)));
2197 shiftrightSysram(K, K, (unsigned)w - 1U);
2198 (void)add_n_1(K, K, (uint32_t)c[i] >> 31, (int16_t)(uint16_t)N_wordlen);
2199 }
2200 c[t] = (int8_t)K[0];
2201 }
2202
/* Dumb n-limb subtraction c = a - b; returns the final borrow (0 or 1).
 * a is read directly; b is read and c written through GET_WORD, which
 * presumably routes the access to CASPER memory -- confirm against the
 * macro definitions. The sub_borrow* macros chain the borrow limb by limb.
 */
static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n)
{
    int i;
    uint32_t borrow;
    /* First limb has no incoming borrow; it seeds the borrow chain. */
    sub_borrowout(borrow, GET_WORD(&c[0]), a[0], GET_WORD(&b[0]));
    for (i = 1; i < n; i++)
    {
        sub_borrowin_borrowout(borrow, GET_WORD(&c[i]), a[i], GET_WORD(&b[i]), borrow);
    }
    return borrow;
}
2214
/* Reference implementations of plain (non-CASPER-memory) limb subtraction
 * and addition, kept for documentation purposes but compiled out. */
#if 0
/* Dumb n-limb subtraction of c=a-b, return borrow. */
static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n) {
  int i;
  uint32_t borrow;
  sub_borrowout(borrow, c[0], a[0], b);
  for (i = 1; i < n; i++) {
    sub_borrowin_borrowout_1(borrow, c[i], a[i], borrow);
  }
  return borrow;
}

/* Dumb n-limb addition of c=a+b, return carry. */
static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n) {
  int i;
  uint32_t carry;
  add_cout(carry, c[0], a[0], b[0]);
  for (i = 1; i < n; i++) {
    add_cout_cin(carry, c[i], a[i], b[i], carry);
  }
  return carry;
}
#endif
2238
/* Dumb n-limb addition of a single word: c = a + b, where b is one limb.
 * Returns the final carry (0 or 1). The add_cout* macros chain the carry
 * limb by limb; after the first limb only the carry is propagated. */
static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n)
{
    int i;
    uint32_t carry;
    /* First limb absorbs the single-word addend and seeds the carry chain. */
    add_cout(carry, c[0], a[0], b);
    for (i = 1; i < n; i++)
    {
        add_cout_cin(carry, c[i], a[i], 0U, carry);
    }
    return carry;
}
2251
/* Absolute value of a signed 8-bit digit, returned as unsigned.
 * Integer promotion widens v to int before negation, so the result is
 * well defined even for v == -128 (yields 128). */
static uint8_t int8abs(int8_t v)
{
    int widened = v;
    if (widened < 0)
    {
        widened = -widened;
    }
    return (uint8_t)widened;
}
2256
2257 /* Constant time elliptic curve scalar multiplication.
2258 * Source: https://eprint.iacr.org/2014/130.pdf
2259 * when using w = 4.
2260 * Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
2261 * p is the prime used to define the finite field F_p
2262 * q is the (prime) order of the curve
2263 */
/* Constant-time Jacobian scalar multiplication using a signed fixed
 * window (w = 4) over the recoded scalar. See the comment above for the
 * algorithm reference. The LUT holds 1P, 3P, 5P, 7P; lookups touch every
 * entry and mask the wanted one so access patterns are scalar-independent.
 */
void Jac_scalar_multiplication(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q)
{
    uint32_t *scalar, *M, *X, *Y, *Z, *mem_loc;
    uint32_t *ONE = NULL;
    int i, sign, odd;
    uint8_t index;
    size_t recodeLength = 175u;
    size_t bitlen = 0u;
    int8_t rec[CASPER_RECODE_LENGTH_MAX] = {0};

    /* Per-curve parameters: recoding length, scalar bit length, and the
     * Montgomery representation of 1 for the curve prime. */
    if (N_wordlen == 8U)
    {
        recodeLength = (size_t)kCASPER_ECC_P256_recode_len;
        bitlen       = (size_t)kCASPER_ECC_P256_N_bitlen;
        ONE          = NISTr256;
    }

    if (N_wordlen == 12U)
    {
        recodeLength = (size_t)kCASPER_ECC_P384_recode_len;
        bitlen       = (size_t)kCASPER_ECC_P384_N_bitlen;
        ONE          = NISTr384;
    }

    if (N_wordlen == 18U)
    {
        recodeLength = (size_t)kCASPER_ECC_P521_recode_len;
        bitlen       = (size_t)521U;
        ONE          = NISTr521;
    }

    /* Point to the start of the LUT table space. */
    mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];

    /* Working buffers placed after the 12 LUT slots. */
    scalar = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * (N_wordlen + 4U)];
    X      = &CASPER_MEM[(20U * N_wordlen + 80U) + 13U * (N_wordlen + 4U)];
    Y      = &CASPER_MEM[(20U * N_wordlen + 80U) + 14U * (N_wordlen + 4U)];
    Z      = &CASPER_MEM[(20U * N_wordlen + 80U) + 15U * (N_wordlen + 4U)];
    M      = &CASPER_MEM[(20U * N_wordlen + 80U) + 16U * (N_wordlen + 4U)];

    /* Copy the scalar into CASPER memory for recoding. */
    CASPER_MEMCPY(scalar, k, sizeof(uint32_t) * N_wordlen);

    /* Precomputation: compute 1*P, 3*P, 5*P, and 7*P.
     * FSL_CASPER_LUT(P, x) addresses coordinate x (0=X, 1=Y, 2=Z) of the
     * LUT entry for the odd multiple P. */
#define FSL_CASPER_LUT(P, x) (mem_loc + (3U * ((P)-1U) / 2U + (x)) * (N_wordlen + 4U))

    /* Set 1*P = (X1 : Y1 : 1), with 1 in Montgomery form. */
    CASPER_MEMCPY(Z3, ONE, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 0U), X1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 1U), Y1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 2U), Z3, N_wordlen * sizeof(uint32_t));

    /* Compute 2*P (kept only as an intermediate in (X3:Y3:Z3)). */
    Jac_double(X3, Y3, Z3, X1, Y1, Z3);

    /* Compute 3*P = 2P + P */
    Jac_add_affine(FSL_CASPER_LUT(3U, 0U), FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3, X1, Y1);

    /* Compute 5*P = 3P + 2P */
    Jac_addition(FSL_CASPER_LUT(5U, 0U), FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), FSL_CASPER_LUT(3U, 0U),
                 FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3);

    /* Compute 7*P = 5P + 2P */
    Jac_addition(FSL_CASPER_LUT(7U, 0U), FSL_CASPER_LUT(7U, 1U), FSL_CASPER_LUT(7U, 2U), FSL_CASPER_LUT(5U, 0U),
                 FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), X3, Y3, Z3);

    /* Recode the scalar. The recoding requires an odd scalar, so an even
     * k is replaced by q - k (odd, since q is prime); the final result is
     * negated again at the end. casper_select(d, a, b, cond) presumably
     * sets d = cond ? b : a -- confirm against its definition. */
    odd = (int32_t)((uint32_t)(casper_get_word(&scalar[0]) & 1U));
    (void)sub_n(M, q, scalar, (int16_t)(uint16_t)N_wordlen); // todo!!!
    casper_select(scalar, M, scalar, odd, (int16_t)(uint16_t)N_wordlen);

    /* Recode into ceil(bitlen/3) + 1 signed digits (w = 4). */
    uint32_t scalarSysram[CASPER_MAX_ECC_SIZE_WORDLEN];
    CASPER_MEMCPY(scalarSysram, scalar, /*CASPER_*/ N_wordlen * sizeof(uint32_t));
    recode(rec, scalarSysram, (int32_t)bitlen, 4);

    /* Initialize the accumulator from the most significant digit. */
    index = int8abs(rec[recodeLength - 1U]);
    sign  = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[recodeLength - 1U]) >> 7);

    CASPER_MEMCPY(X3, FSL_CASPER_LUT((uint32_t)index, 0U), N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Y3, FSL_CASPER_LUT((uint32_t)index, 1U), N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Z3, FSL_CASPER_LUT((uint32_t)index, 2U), N_wordlen * sizeof(uint32_t));

    /* Get the correct LUT element in constant time by touching
     * all elements and masking out the correct one.
     */

#define GET_LUT(x, y, z, index)                                                       \
    do                                                                                \
    {                                                                                 \
        int m;                                                                        \
        CASPER_MEMCPY((x), FSL_CASPER_LUT(1U, 0U), N_wordlen * sizeof(uint32_t));     \
        CASPER_MEMCPY((y), FSL_CASPER_LUT(1U, 1U), N_wordlen * sizeof(uint32_t));     \
        CASPER_MEMCPY((z), FSL_CASPER_LUT(1U, 2U), N_wordlen * sizeof(uint32_t));     \
        m = (int32_t)((index) == 3U);                                                 \
        casper_select((x), (x), FSL_CASPER_LUT(3U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(3U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(3U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
        m = (int32_t)((index) == 5U);                                                 \
        casper_select((x), (x), FSL_CASPER_LUT(5U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(5U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(5U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
        m = (int32_t)((index) == 7U);                                                 \
        casper_select((x), (x), FSL_CASPER_LUT(7U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(7U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(7U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
    } while (false)

    GET_LUT(X3, Y3, Z3, index);

    /* Compute -y and select the positive or negative point. */
    (void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!
    casper_select(Y3, Y3, M, sign, (int16_t)(uint16_t)N_wordlen);

    /* Main ladder: for each remaining digit, triple-double (shift by the
     * window stride w-1 = 3 bits) then add the selected odd multiple. */
    for (i = (int)(uint32_t)(recodeLength - 2U); i >= 0; i--)
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);

        index = int8abs(rec[i]);
        sign  = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[i]) >> 7);

        GET_LUT(X, Y, Z, index);

        /* Compute -y and select the positive or negative point.
         * The scalar buffer is dead by now and is reused as scratch. */
        (void)sub_n(scalar, p, Y, (int16_t)(uint16_t)N_wordlen); // todo!!!
        casper_select(scalar, Y, scalar, sign, (int16_t)(uint16_t)N_wordlen);

        Jac_addition(X3, Y3, Z3, X3, Y3, Z3, X, scalar, Z);
    }

    /* If the original scalar was even we actually computed (q - k) * P;
     * negating Y yields k * P. */
    (void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!

    casper_select(Y3, M, Y3, odd, (int16_t)(uint16_t)N_wordlen);
}
2403
2404 #undef FSL_CASPER_LUT
2405 #undef GET_LUT
2406
2407 /*
2408 * Pre-compute the following 16 points:
2409 * 00 00 = 0*P + 0*Q <-- Not needed when using sliding windows
2410 * 00 01 = 0*P + 1*Q <-- Not needed when using sliding windows
2411 * 00 10 = 0*P + 2*Q
2412 * 00 11 = 0*P + 3*Q
2413 *
2414 * 01 00 = 1*P + 0*Q <-- Not needed when using sliding windows
2415 * 01 01 = 1*P + 1*Q <-- Not needed when using sliding windows
2416 * 01 10 = 1*P + 2*Q
2417 * 01 11 = 1*P + 3*Q
2418 *
2419 * 10 00 = 2*P + 0*Q
2420 * 10 01 = 2*P + 1*Q
2421 * 10 10 = 2*P + 2*Q
2422 * 10 11 = 2*P + 3*Q
2423 *
2424 * 11 00 = 3*P + 0*Q
2425 * 11 01 = 3*P + 1*Q
2426 * 11 10 = 3*P + 2*Q
2427 * 11 11 = 3*P + 3*Q
2428 *
2429 * index = (bitsi||bitsj)-2 - (biti != 0)*2
2430 *
2431 * Input: P = (X1 : Y1 : Z1) and
2432 * Q = (X2 : Y2 : Z2)
2433 * Output: mem_loc, memory location for the LUT.
2434 */
2435
precompute_double_scalar_LUT16(uint32_t * Px,uint32_t * Py,uint32_t * Qx,uint32_t * Qy)2436 static void precompute_double_scalar_LUT16(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy)
2437 {
2438 uint32_t *Q2x, *Q2y, *Q2z, *P2x, *P2y, *P2z, *Z, *mem_loc;
2439 uint32_t *ONE = NULL;
2440 uint32_t index = 0;
2441
2442 if (N_wordlen == 8U)
2443 {
2444 ONE = NISTr256;
2445 }
2446
2447 if (N_wordlen == 12U)
2448 {
2449 ONE = NISTr384;
2450 }
2451
2452 Q2x = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 0U * (N_wordlen + 4U)];
2453 Q2y = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)];
2454 Q2z = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)];
2455
2456 /* Re-use memory from different scratch space since no
2457 * projective point addition is used below. */
2458 P2x = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
2459 P2z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
2460 P2y = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
2461 Z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];
2462
2463 mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];
2464
2465 CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t));
2466
2467 // 00 10 = 0*P + 2*Q
2468 Jac_double(Q2x, Q2y, Q2z, Qx, Qy, Z);
2469 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2470 index += N_wordlen;
2471 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2472 index += N_wordlen;
2473 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2474 index += N_wordlen;
2475
2476 // 00 11 = 0*P + 3*Q
2477 Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Qx, Qy);
2478 CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2479 index += N_wordlen;
2480 CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2481 index += N_wordlen;
2482 CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2483 index += N_wordlen;
2484
2485 // 01 10 = 1*P + 2*Q
2486 Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Px, Py);
2487 CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2488 index += N_wordlen;
2489 CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2490 index += N_wordlen;
2491 CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2492 index += N_wordlen;
2493
2494 // 01 11 = 1*P + 3*Q
2495 Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Qx, Qy);
2496 CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2497 index += N_wordlen;
2498 CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2499 index += N_wordlen;
2500 CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2501 index += N_wordlen;
2502
2503 // 10 00 = 2*P + 0*Q
2504 Jac_double(P2x, P2y, P2z, Px, Py, Z);
2505 CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2506 index += N_wordlen;
2507 CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2508 index += N_wordlen;
2509 CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2510 index += N_wordlen;
2511
2512 // 10 01 = 2*P + 1*Q
2513 Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy);
2514 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2515 index += N_wordlen;
2516 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2517 index += N_wordlen;
2518 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2519 index += N_wordlen;
2520
2521 // 10 10 = 2*P + 2*Q
2522 Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2523 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2524 index += N_wordlen;
2525 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2526 index += N_wordlen;
2527 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2528 index += N_wordlen;
2529
2530 // 10 11 = 2*P + 3*Q
2531 Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2532 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2533 index += N_wordlen;
2534 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2535 index += N_wordlen;
2536 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2537 index += N_wordlen;
2538
2539 // 11 00 = 3*P + 0*Q
2540 Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Px, Py);
2541 CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2542 index += N_wordlen;
2543 CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2544 index += N_wordlen;
2545 CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2546 index += N_wordlen;
2547
2548 // 11 01 = 3*P + 1*Q
2549 Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy);
2550 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2551 index += N_wordlen;
2552 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2553 index += N_wordlen;
2554 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2555 index += N_wordlen;
2556
2557 // 11 10 = 3*P + 2*Q
2558 Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2559 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2560 index += N_wordlen;
2561 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2562 index += N_wordlen;
2563 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2564 index += N_wordlen;
2565
2566 // 11 11 = 3*P + 3*Q
2567 Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2568 CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2569 index += N_wordlen;
2570 CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2571 index += N_wordlen;
2572 CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2573 index += N_wordlen;
2574 }
2575
2576 /*
2577 * Pre-compute the following 4 points:
2578 * 0 0 = 0*P + 0*Q <-- Not needed when using sliding windows
2579 * 0 1 = 0*P + 1*Q
2580 *
2581 * 1 0 = 1*P + 0*Q
2582 * 1 1 = 1*P + 1*Q
2583 *
2584 * index = (bitsj+1) & (0-bitsi)
2585 *
2586 * Input: P = (X1 : Y1 : Z1) and
2587 * Q = (X2 : Y2 : Z2)
2588 * Output: mem_loc, memory location for the LUT.
2589 */
2590
precompute_double_scalar_LUT4(uint32_t * Px,uint32_t * Py,uint32_t * Qx,uint32_t * Qy)2591 static void precompute_double_scalar_LUT4(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy)
2592 {
2593 uint32_t *Z, *mem_loc, *ONE;
2594 uint32_t index = 0;
2595
2596 ONE = NISTr521;
2597
2598 /* Re-use memory from different scratch space since no
2599 * projective point addition is used below. */
2600 Z = &CASPER_MEM[(11U * N_wordlen + 4U) + 5U * (N_wordlen + 4U)];
2601 mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];
2602
2603 CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t));
2604
2605 // 0*P + 1*Q
2606 CASPER_MEMCPY(&mem_loc[index], Qx, N_wordlen * sizeof(uint32_t));
2607 index += N_wordlen;
2608 CASPER_MEMCPY(&mem_loc[index], Qy, N_wordlen * sizeof(uint32_t));
2609 index += N_wordlen;
2610 CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t));
2611 index += N_wordlen;
2612
2613 // 1*P + 0*Q
2614 CASPER_MEMCPY(&mem_loc[index], Px, N_wordlen * sizeof(uint32_t));
2615 index += N_wordlen;
2616 CASPER_MEMCPY(&mem_loc[index], Py, N_wordlen * sizeof(uint32_t));
2617 index += N_wordlen;
2618 CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t));
2619 index += N_wordlen;
2620
2621 // 1*P + 1*Q
2622 Jac_add_affine(&mem_loc[index], &mem_loc[index + N_wordlen], &mem_loc[index + 2U * N_wordlen], Px, Py, Z, Qx, Qy);
2623 }
2624
2625 #define GETLUTX(x) (3U * (x)*N_wordlen)
2626 #define GETLUTY(x) (3U * (x)*N_wordlen + 1U * N_wordlen)
2627 #define GETLUTZ(x) (3U * (x)*N_wordlen + 2U * N_wordlen)
2628
2629 /* Compute the double scalar multiplication
2630 * (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
2631 * Using Shamir's trick and precomputing 16 points.
2632 * This code is *not* constant time since this is used
2633 * for verification only.
2634 */
/* Compute (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2) using Shamir's
 * trick with sliding windows over a precomputed LUT: 2-bit windows / 16
 * entries for P-256 and P-384, 1-bit windows / 4 entries for P-521.
 * Not constant time -- used for verification only (see comment above).
 */
void double_scalar_multiplication(uint32_t *X3,
                                  uint32_t *Y3,
                                  uint32_t *Z3,
                                  uint32_t *X1,
                                  uint32_t *Y1,
                                  uint32_t *k1,
                                  uint32_t *X2,
                                  uint32_t *Y2,
                                  uint32_t *k2)
{
    uint32_t index = 0, c = 0;
    uint32_t *p1 = NULL, *p2 = NULL, x1, x2, *lut, *Tx = NULL, *Ty = NULL, *Tz = NULL;
    size_t bitlen, shiftr, shiftl = 0u;

    /* Per-curve setup: bit length, LUT precomputation, window geometry
     * (shiftr extracts the top window; shiftl is the window width).
     * NOTE(review): bitlen and shiftr remain uninitialized if N_wordlen is
     * not 8, 12 or 18 -- callers must only pass supported curve sizes. */
    if (N_wordlen == 8U)
    {
        bitlen = (size_t)kCASPER_ECC_P256_N_bitlen;
        precompute_double_scalar_LUT16(X1, Y1, X2, Y2);
        shiftr = 30U;
        shiftl = 2U;
    }

    if (N_wordlen == 12U)
    {
        bitlen = (size_t)kCASPER_ECC_P384_N_bitlen;
        precompute_double_scalar_LUT16(X1, Y1, X2, Y2);
        shiftr = 30U;
        shiftl = 2U;
    }

    if (N_wordlen == 18U)
    {
        bitlen = (size_t)kCASPER_ECC_P521_N_bitlen;
        precompute_double_scalar_LUT4(X1, Y1, X2, Y2);
        shiftr = 31U;
        shiftl = 1U;
    }

    lut = &CASPER_MEM[(20U * N_wordlen + 80U)];

    /* Working copies of the scalars (p1, p2) and a temporary point
     * (Tx, Ty, Tz), placed after the LUT for the active curve. */
    if (N_wordlen == 8U || N_wordlen == 12U)
    {
        p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen];
        p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)];

        Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)];
        Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 3U * (N_wordlen + 4U)];
        Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 4U * (N_wordlen + 4U)];
    }

    if (N_wordlen == 18U)
    {
        p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen];
        p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 1U * (N_wordlen + 4U)];

        Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 2U * (N_wordlen + 4U)];
        Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 3U * (N_wordlen + 4U)];
        Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 4U * (N_wordlen + 4U)];
    }

    CASPER_MEMCPY(p1, k1, sizeof(uint32_t) * N_wordlen);
    CASPER_MEMCPY(p2, k2, sizeof(uint32_t) * N_wordlen);

    /* Check if we can slide: skip leading zero bits of both scalars
     * (no doublings are needed before the first non-zero window). */
    while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U && c < bitlen)
    {
        shiftleft(p1, p1, 1U);
        shiftleft(p2, p2, 1U);
        c++;
        /* No doubling needed. */
    }

    /* Consume the first window: x1/x2 are the top shiftl bits of each
     * scalar; index maps them to a LUT slot (see the table above). */
    x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr;
    x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr;
    if (N_wordlen == 8U || N_wordlen == 12U)
    {
        index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U;
    }

    if (N_wordlen == 18U)
    {
        index = (((x2) + 1U) & (0U - (x1)));
    }
    shiftleft(p1, p1, shiftl);
    shiftleft(p2, p2, shiftl);

    CASPER_MEMCPY(X3, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Y3, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Z3, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t));
    c += shiftl;

    // todo: create an is_zero function

    /* Loop while either shifted scalar is non-zero.
     * NOTE(review): this test reads a fixed 18 limbs of p1 and p2 even for
     * P-256/P-384 (N_wordlen 8/12), i.e. words beyond the scalar length;
     * it appears to rely on the surrounding buffer layout -- verify. */
    while ((casper_get_word(&p1[0]) | casper_get_word(&p1[1]) | casper_get_word(&p1[2]) | casper_get_word(&p1[3]) |
            casper_get_word(&p1[4]) | casper_get_word(&p1[5]) | casper_get_word(&p1[6]) | casper_get_word(&p1[7]) |
            casper_get_word(&p1[8]) | casper_get_word(&p1[9]) | casper_get_word(&p1[10]) | casper_get_word(&p1[11]) |
            casper_get_word(&p1[12]) | casper_get_word(&p1[13]) | casper_get_word(&p1[14]) | casper_get_word(&p1[15]) |
            casper_get_word(&p1[16]) | casper_get_word(&p1[17]) | casper_get_word(&p2[0]) | casper_get_word(&p2[1]) |
            casper_get_word(&p2[2]) | casper_get_word(&p2[3]) | casper_get_word(&p2[4]) | casper_get_word(&p2[5]) |
            casper_get_word(&p2[6]) | casper_get_word(&p2[7]) | casper_get_word(&p2[8]) | casper_get_word(&p2[9]) |
            casper_get_word(&p2[10]) | casper_get_word(&p2[11]) | casper_get_word(&p2[12]) | casper_get_word(&p2[13]) |
            casper_get_word(&p2[14]) | casper_get_word(&p2[15]) | casper_get_word(&p2[16]) |
            casper_get_word(&p2[17])) != 0U)
    {
        /* Check if we can slide: double through runs of zero windows. */
        while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U &&
               c < bitlen)
        {
            shiftleft(p1, p1, 1U);
            shiftleft(p2, p2, 1U);
            Jac_double(X3, Y3, Z3, X3, Y3, Z3);
            c++;
        }

        if (c >= (bitlen - 1U))
        {
            break;
        }

        /* One doubling per window bit, then add the LUT entry. */
        for (uint32_t i = 0; i < shiftl; i++)
        {
            Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        }

        x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr;
        x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr;

        if (N_wordlen == 8U || N_wordlen == 12U)
        {
            index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U;
        }

        if (N_wordlen == 18U)
        {
            index = (((x2) + 1U) & (0U - (x1)));
        }

        shiftleft(p1, p1, shiftl);
        shiftleft(p2, p2, shiftl);

        CASPER_MEMCPY(Tx, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t));
        CASPER_MEMCPY(Ty, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t));
        CASPER_MEMCPY(Tz, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t));

        Jac_addition(X3, Y3, Z3, X3, Y3, Z3, Tx, Ty,
                     Tz); //&lut[GETLUTX(index)], &lut[GETLUTY(index)], &lut[GETLUTZ(index)]);
        c += shiftl;
    }

    /* Special case in the end: exactly one unprocessed bit remains, so it
     * cannot form a full window; handle the two single-bit additions. */
    if (c == (bitlen - 1U))
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        x1 = casper_get_word(&p1[N_wordlen - 1U]) >> 31;
        x2 = casper_get_word(&p2[N_wordlen - 1U]) >> 31;
        if (0U != x1)
        {
            Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X1, Y1);
        }
        if (x2 != 0U)
        {
            Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X2, Y2);
        }
        c++;
    }

    /* Flush any remaining (zero) bits with doublings. */
    while (c < bitlen)
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        c++;
    }
}
2808
/* Inversion modulo the NIST P-256 prime via Fermat's little theorem:
 * c = a^(p-2) mod p, evaluated with a fixed addition chain (the hex
 * comments track the exponent prefix accumulated so far). If a is in
 * Montgomery form this transparently yields the Montgomery inverse.
 * The square/multiply loops ping-pong between two scratch slots to avoid
 * extra copies, so the statement order is significant throughout.
 */
static void invert_mod_p256(uint32_t *c, uint32_t *a)
{
    int i;
    uint32_t *t, *t2, *s1, *s2, *s4, *s8, *tmp;

    /* Assuming it is safe to use the ECC scratch size. */
    t   = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                     (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                    2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    t2  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s1  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s2  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s4  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s8  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    tmp = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                      8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];

    // t2 = n^(2^1)*n # 11
    square_casper(tmp, a);
    multiply_casper(t2, tmp, a);

    // s1 = t2^(2^2)*t2 # F
    square_casper(s1, t2);
    square_casper(tmp, s1);
    multiply_casper(s1, tmp, t2);

    // s2 = s1^(2^4)*s1 # FF
    square_casper(s2, s1);
    // for (i = 1; i < 4; i++) square(s2, s2);
    square_casper(tmp, s2);
    square_casper(s2, tmp);
    square_casper(tmp, s2);
    multiply_casper(s2, tmp, s1);

    // s4 = s2^(2^8)*s2 # FFFF
    /* Each loop iteration performs two squarings (ping-pong), hence i += 2. */
    square_casper(s4, s2);
    for (i = 1; i < 7; i += 2)
    {
        square_casper(tmp, s4);
        square_casper(s4, tmp);
    }
    square_casper(tmp, s4);
    multiply_casper(s4, tmp, s2);

    // s8 = s4^(2^16)*s4 # FFFFFFFF
    square_casper(s8, s4);
    for (i = 1; i < 15; i += 2)
    {
        square_casper(tmp, s8);
        square_casper(s8, tmp);
    }
    square_casper(tmp, s8);
    multiply_casper(s8, tmp, s4);

    // t = s8^(2^32)*n # ffffffff00000001
    square_casper(tmp, s8);
    for (i = 1; i < 31; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    square_casper(t, tmp);
    multiply_casper(tmp, t, a);

    // t = t^(2^128)*s8 # ffffffff00000001000000000000000000000000ffffffff
    for (i = 0; i < 128; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    multiply_casper(t, tmp, s8);

    // t = t^(2^32)*s8 # ffffffff00000001000000000000000000000000ffffffffffffffff
    for (i = 0; i < 32; i += 2)
    {
        square_casper(tmp, t);
        square_casper(t, tmp);
    }
    multiply_casper(tmp, t, s8);

    // t = t^(2^16)*s4 # ffffffff00000001000000000000000000000000ffffffffffffffffffff
    for (i = 0; i < 16; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    multiply_casper(t, tmp, s4);

    // t = t^(2^8)*s2 # ffffffff00000001000000000000000000000000ffffffffffffffffffffff
    for (i = 0; i < 8; i += 2)
    {
        square_casper(tmp, t);
        square_casper(t, tmp);
    }
    multiply_casper(tmp, t, s2);

    // t = t^(2^4)*s1 # ffffffff00000001000000000000000000000000fffffffffffffffffffffff
    for (i = 0; i < 4; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    multiply_casper(t, tmp, s1);

    // t = t^(2^2)*t2
    square_casper(tmp, t);
    square_casper(t, tmp);
    multiply_casper(tmp, t, t2);

    // t = t^(2^2)*n # ffffffff00000001000000000000000000000000fffffffffffffffffffffffd
    square_casper(t, tmp);
    square_casper(tmp, t);
    multiply_casper(c, tmp, a);
}
2934
2935 // A and C do not need to be in Casper memory
toMontgomery_ECC_P256(uint32_t * C,uint32_t * A)2936 static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A)
2937 {
2938 /* R^2 = 2^512 mod p, used to convert values to Montgomery form. */
2939 uint32_t R2[kCASPER_ECC_P256_wordlen] = {0x00000003, 0x00000000, 0xffffffffU, 0xfffffffbU,
2940 0xfffffffeU, 0xffffffffU, 0xfffffffdU, 0x4};
2941 uint32_t *T1, *T2, *T3;
2942 T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2943 (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2944 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2945 T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2946 (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2947 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2948 T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2949 (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2950 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2951
2952 CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
2953 CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
2954
2955 multiply_casper(T3, T2, T1);
2956 CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
2957 }
2958
2959 /* Compute inversion modulo NIST-p384 using Fermats little theorem.
2960 * Using c = a^(p-2) = a^(-1) mod p.
2961 * This computes the modular inversion if all arithmetic is "regular"
2962 * modular arithmetic or computes automatically the Montgomery inverse
2963 * if all arithmetic is Montgomery arithmetic.
2964 */
2965
invert_mod_p384(uint32_t * c,uint32_t * a)2966 static void invert_mod_p384(uint32_t *c, uint32_t *a)
2967 {
2968 int i;
2969 uint32_t *e, *d, *tmp, *t0, *t1, *t2, *t3, *t4, *t5, *t6; // 10 residues needed
2970
2971 /* Assuming it is safe to use the LUT scratch size.
2972 * Hence, do not invert while elements in the LUT are needed.
2973 */
2974 e = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2975 d = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2976 tmp =
2977 &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2978 t0 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2979 t1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2980 t2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2981 t3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2982 t4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2983 t5 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2984 t6 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2985
2986 square_casper(tmp, a); // 2
2987 square_casper(t1, tmp); // 4
2988 square_casper(tmp, t1); // 8
2989 multiply_casper(t2, tmp, t1); // 12
2990 multiply_casper(d, a, t2); // 13
2991 multiply_casper(e, d, a); // 14
2992 multiply_casper(t0, e, a); // 15
2993
2994 // t1 = t0^(2^4)*t0 # ff
2995 square_casper(tmp, t0);
2996 square_casper(t1, tmp);
2997 square_casper(tmp, t1);
2998 square_casper(t2, tmp);
2999 multiply_casper(t1, t2, t0);
3000
3001 // t2 = t1^(2^8)*t1 # 4f
3002 square_casper(tmp, t1);
3003 for (i = 0; i < 3; i++)
3004 {
3005 square_casper(t3, tmp);
3006 square_casper(tmp, t3);
3007 }
3008 square_casper(t3, tmp);
3009 multiply_casper(t2, t3, t1);
3010
3011 // t3 = t2^(2^16)*t2 # 8f
3012 square_casper(tmp, t2);
3013 for (i = 0; i < 7; i++)
3014 {
3015 square_casper(t4, tmp);
3016 square_casper(tmp, t4);
3017 }
3018 square_casper(t4, tmp);
3019 multiply_casper(t3, t4, t2);
3020
3021 // t4 = t3^(2^32)*t3 # 16f
3022 square_casper(tmp, t3);
3023 for (i = 0; i < 15; i++)
3024 {
3025 square_casper(t5, tmp);
3026 square_casper(tmp, t5);
3027 }
3028 square_casper(t5, tmp);
3029 multiply_casper(t4, t5, t3);
3030
3031 // t5 = t4^(2^64)*t4 # 32f
3032 square_casper(tmp, t4);
3033 for (i = 0; i < 31; i++)
3034 {
3035 square_casper(t6, tmp);
3036 square_casper(tmp, t6);
3037 }
3038 square_casper(t6, tmp);
3039 multiply_casper(t5, t6, t4);
3040
3041 // t5 = t5^(2^64)*t4 # 48f
3042 square_casper(tmp, t5);
3043 for (i = 0; i < 31; i++)
3044 {
3045 square_casper(t6, tmp);
3046 square_casper(tmp, t6);
3047 }
3048 square_casper(t6, tmp);
3049 multiply_casper(t5, t6, t4);
3050
3051 // t5 = t5^(2^32)*t3 # 56f
3052 square_casper(tmp, t5);
3053 for (i = 0; i < 15; i++)
3054 {
3055 square_casper(t6, tmp);
3056 square_casper(tmp, t6);
3057 }
3058 square_casper(t6, tmp);
3059 multiply_casper(t5, t6, t3);
3060
3061 // t5 = t5^(2^16)*t2 # 60f
3062 square_casper(tmp, t5);
3063 for (i = 0; i < 7; i++)
3064 {
3065 square_casper(t6, tmp);
3066 square_casper(tmp, t6);
3067 }
3068 square_casper(t6, tmp);
3069 multiply_casper(t5, t6, t2);
3070
3071 // t5 = t5^(2^8)*t1 # 62f
3072 square_casper(tmp, t5);
3073 for (i = 0; i < 3; i++)
3074 {
3075 square_casper(t6, tmp);
3076 square_casper(tmp, t6);
3077 }
3078 square_casper(t6, tmp);
3079 multiply_casper(t5, t6, t1);
3080
3081 // n = t5^(2^4)*t0 # 63f
3082 square_casper(tmp, t5);
3083 for (i = 0; i < 1; i++)
3084 {
3085 square_casper(t6, tmp);
3086 square_casper(tmp, t6);
3087 }
3088 square_casper(t6, tmp);
3089 multiply_casper(t5, t6, t0);
3090
3091 // n = n^(2^4)*e
3092 square_casper(tmp, t5);
3093 for (i = 0; i < 1; i++)
3094 {
3095 square_casper(t6, tmp);
3096 square_casper(tmp, t6);
3097 }
3098 square_casper(t6, tmp);
3099 multiply_casper(t5, t6, e);
3100
3101 // n = n^(2^32)*t3
3102 square_casper(tmp, t5);
3103 for (i = 0; i < 15; i++)
3104 {
3105 square_casper(t6, tmp);
3106 square_casper(tmp, t6);
3107 }
3108 square_casper(t6, tmp);
3109 multiply_casper(t5, t6, t3);
3110
3111 // n = n^(2^64)
3112 square_casper(tmp, t5);
3113 for (i = 0; i < 31; i++)
3114 {
3115 square_casper(t6, tmp);
3116 square_casper(tmp, t6);
3117 }
3118 square_casper(t5, tmp);
3119
3120 // n = n^(2^16)*t2
3121 square_casper(tmp, t5);
3122 for (i = 0; i < 7; i++)
3123 {
3124 square_casper(t6, tmp);
3125 square_casper(tmp, t6);
3126 }
3127 square_casper(t6, tmp);
3128 multiply_casper(t5, t6, t2);
3129
3130 // n = n^(2^8)*t1
3131 square_casper(tmp, t5);
3132 for (i = 0; i < 3; i++)
3133 {
3134 square_casper(t6, tmp);
3135 square_casper(tmp, t6);
3136 }
3137 square_casper(t6, tmp);
3138 multiply_casper(t5, t6, t1);
3139
3140 // n = n^(2^4)*t0
3141 square_casper(tmp, t5);
3142 for (i = 0; i < 1; i++)
3143 {
3144 square_casper(t6, tmp);
3145 square_casper(tmp, t6);
3146 }
3147 square_casper(t6, tmp);
3148 multiply_casper(t5, t6, t0);
3149
3150 // n = n^(2^4)*d
3151 square_casper(tmp, t5);
3152 for (i = 0; i < 1; i++)
3153 {
3154 square_casper(t6, tmp);
3155 square_casper(tmp, t6);
3156 }
3157 square_casper(t6, tmp);
3158 multiply_casper(c, t6, d);
3159 }
3160
3161 // A and C do not need to be in Casper memory
toMontgomery_ECC_P384(uint32_t * C,uint32_t * A)3162 static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A)
3163 {
3164 /* R^2 = 2^768 mod p, used to convert values to Montgomery form. */
3165 uint32_t R2[kCASPER_ECC_P384_wordlen] = {0x00000001, 0xfffffffeU, 0x00000000, 0x00000002, 0x00000000, 0xfffffffeU,
3166 0x00000000, 0x00000002, 0x1, 0x0, 0x0, 0x0};
3167 uint32_t *T1, *T2, *T3;
3168 T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
3169 (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
3170 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
3171 T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
3172 (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
3173 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
3174 T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
3175 (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
3176 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
3177
3178 CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
3179 CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
3180
3181 multiply_casper(T3, T2, T1);
3182 CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
3183 }
3184
/* Compute c = a^-1 mod p for the NIST P-521 prime p = 2^521 - 1 via Fermat's
 * little theorem: a^-1 = a^(p-2) = a^(2^521 - 3) mod p.
 * In binary the exponent 2^521 - 3 is 519 one-bits followed by "01", so it is
 * evaluated with a fixed square-and-multiply addition chain: the operation
 * sequence is independent of the value of a (no secret-dependent branches).
 * a and c are residues in CASPER memory; since every multiplication here is a
 * Montgomery multiplication, the result stays in the same (Montgomery) domain
 * as the input.
 * Stage comments below track the exponent as a count of one-bits:
 * a^(2^k - 1) is written "k ones".
 */
static void invert_mod_p521(uint32_t *c, uint32_t *a)
{
    int i;
    uint32_t *e3, *d2, *d3, *d4, *T2, *T4; // 6 residues needed

    /* Assuming it is safe to use the LUT scratch size.
     * Hence, do not invert while elements in the LUT are needed.
     */
    e3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    d2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    d3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    d4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    T4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];

    /* T2 = a^3 (2 ones) */
    square_casper(d2, a);
    multiply_casper(T2, d2, a);

    /* T4 = T2^(2^2) * T2 = a^(2^4 - 1) (4 ones) */
    // d3 = 2^2 * T2
    square_casper(d3, T2);
    square_casper(e3, d3);
    multiply_casper(T4, e3, T2);

    /* d2 = T4^(2^4) * T4 = a^(2^8 - 1) (8 ones) */
    // d3 = 2^4 * T4
    square_casper(d3, T4);
    square_casper(e3, d3);
    square_casper(d3, e3);
    square_casper(e3, d3);
    multiply_casper(d2, e3, T4);

    /* d4 = d2^(2^8) * d2 = a^(2^16 - 1) (16 ones) */
    // d3 = 2^8 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    for (i = 0; i < 3; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d4, e3, d2);

    /* d2 = d4^(2^16) * d4 = a^(2^32 - 1) (32 ones) */
    // d3 = 2^16 * d2
    square_casper(d3, d4);
    square_casper(e3, d3);
    for (i = 0; i < 7; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d2, e3, d4);

    /* d4 = d2^(2^32) * d2 = a^(2^64 - 1) (64 ones) */
    // d3 = 2^32 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    for (i = 0; i < 15; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d4, e3, d2);

    /* d2 = d4^(2^64) * d4 = a^(2^128 - 1) (128 ones) */
    // d3 = 2^64 * d2
    square_casper(d3, d4);
    square_casper(e3, d3);
    for (i = 0; i < 31; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d2, e3, d4);

    /* d4 = d2^(2^128) * d2 = a^(2^256 - 1) (256 ones) */
    // d3 = 2^128 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    for (i = 0; i < 63; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d4, e3, d2);

    /* d2 = d4^(2^256) * d4 = a^(2^512 - 1) (512 ones) */
    // d3 = 2^256 * d2
    square_casper(d3, d4);
    square_casper(e3, d3);
    for (i = 0; i < 127; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d2, e3, d4);

    /* d2 = d2^(2^2) * T2 = a^(2^514 - 1) (514 ones) */
    // d3 = 2^2 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    multiply_casper(d2, e3, T2);

    /* d2 = d2^(2^4) * T4 = a^(2^518 - 1) (518 ones) */
    // d3 = 2^4 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    square_casper(d3, e3);
    square_casper(e3, d3);
    multiply_casper(d2, e3, T4);

    /* d2 = d2^2 * a = a^(2^519 - 1) (519 ones) */
    square_casper(d3, d2);
    multiply_casper(d2, d3, a);

    /* c = d2^(2^2) * a = a^(2^521 - 3) = a^(p-2) = a^-1 mod p
     * (binary exponent: 519 ones, then "01") */
    // d3 = 2 ^ 2 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    multiply_casper(c, e3, a);
}
3295
toMontgomery_ECC_P521(uint32_t * C,uint32_t * A)3296 static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A)
3297 {
3298 /* R^2 = 2^1088 mod p, used to convert values to Montgomery form. */
3299 // uint32_t R2[NUM_LIMBS] = { 0x00000000, 0x4000, 0, 0,
3300 // 0, 0, 0, 0,
3301 // 0, 0, 0, 0,
3302 // 0 };
3303 /* R^2 = 2^1152 mod p, used to convert values to Montgomery form. */
3304 uint32_t R2[kCASPER_ECC_P521_wordlen] = {0, 0, 0, 0x4000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3305
3306 uint32_t *T1, *T2, *T3;
3307 T1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3308 T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3309 T3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3310
3311 CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
3312 CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
3313
3314 multiply_casper(T3, T2, T1);
3315 CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
3316 }
3317
/* Constant-time CIOS (Coarsely Integrated Operand Scanning) Montgomery
 * multiplication specialized for the P-521 prime p = 2^521 - 1:
 * computes w_out = a * b * R^-1 mod p. When a == NULL the routine performs a
 * reduction-only pass on the value already present in w_out.
 * Operands live in CASPER memory and are accessed through the GET_/SET_DWORD
 * wrappers; the multiply-accumulate and the final subtraction run on the
 * CASPER accelerator (Accel_* calls).
 * The special prime shape lets each per-iteration Montgomery reduction step be
 * done with word shifts plus one 128-bit addition instead of a multiply by p
 * (see the derivation comment inside the loop).
 * NOTE(review): the Np parameter (-p^-1 mod 2^64) is not referenced here —
 * the shift-based reduction makes it unnecessary; it appears to be kept for
 * signature compatibility with the generic CIOS routine. Confirm against the
 * other MultprecCiosMul* variants.
 */
static void MultprecCiosMul521_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    uint64_t carry;
    uint64_t *a64, *b64, *w64;

    /* T1: scratch at the bottom of CASPER memory, used to keep a copy of w
     * before the final conditional subtraction. */
    uint32_t *T1 = &CASPER_MEM[0], borrow;

    /* Reinterpret the 32-bit operand arrays as 64-bit limbs for the accelerator. */
    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(j, w_out);
    }
    /* Clear the two extra accumulator limbs above the N-limb result. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /* Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            /* w += a * b64[j], with the carry out of the top limb tracked manually. */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            /* Carry detection: if the stored top limb wrapped below its old value,
             * propagate a 1 into the next limb. */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }

        /* Fast reduction using only shifts for this special shape:
         * (c - (-p^-1*c mod 2^64) * p)/2^64 =
         * (c - c_0 * p)/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * p)/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * (2^521-1))/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * 2^521 - c_0)/2^64 =
         * c_1 + c_2*2^64 + c_3*2^128 + c_4*2^192 + c_5*2^256 + c_6*2^320 + c_7*2^384 + c_8*2^448 + c_9*2^512 + c_0 *
         * 2^{448 + 9} so one only needs to compute this 128-bit addition: [c_8, c_9] + c_0 * 2^9
         */

        uint64_t *p64 = (uint64_t *)(uintptr_t)T1;

        /* p64 = c_0 * 2^9 as a 128-bit value. */
        /* p64[0] = w64[0] << 9;*/
        SET_DWORD(&p64[0], GET_DWORD(&w64[0]) << 9U);
        /* p64[1] = w64[0] >> (64 - 9); */
        SET_DWORD(&p64[1], GET_DWORD(&w64[0]) >> (64 - 9));
        /* Shift the accumulator down one 64-bit limb (divide by 2^64). */
        /* w64[0] = w64[1]; */
        SET_DWORD(&w64[0], GET_DWORD(&w64[1]));
        /* w64[1] = w64[2]; */
        SET_DWORD(&w64[1], GET_DWORD(&w64[2]));
        /* w64[2] = w64[3]; */
        SET_DWORD(&w64[2], GET_DWORD(&w64[3]));
        /* w64[3] = w64[4]; */
        SET_DWORD(&w64[3], GET_DWORD(&w64[4]));
        /* w64[4] = w64[5]; */
        SET_DWORD(&w64[4], GET_DWORD(&w64[5]));
        /* w64[5] = w64[6]; */
        SET_DWORD(&w64[5], GET_DWORD(&w64[6]));
        /* w64[6] = w64[7]; */
        SET_DWORD(&w64[6], GET_DWORD(&w64[7]));

        /* Compute p64 = p64 + {w64[8], w64[9]} using one additonal double-length limb,
         * where p64 = w64[0] * 2^9.
         */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[8]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(2, (uint32_t)kCASPER_OpAdd64, /* kCASPER_OpAdd64, */
                                                 CA_MK_OFF(p64)));
        Accel_done();

        /* Store the summed top limbs back into the shifted accumulator. */
        /* w64[7] = p64[0]; */
        SET_DWORD(&w64[7], GET_DWORD(&p64[0]));
        /* w64[8] = p64[1]; */
        SET_DWORD(&w64[8], GET_DWORD(&p64[1]));
        /* w64[9] = 0; */
        SET_DWORD(&w64[9], (uint64_t)0U);
    }

    /* memcpy(T1, w_out, (NUM_LIMBS + 1) * sizeof(uint32_t)); */
    /* now check if need to subtract Nmod */
    /* Keep a copy of w so the subtraction can be undone without branching. */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    /* if w_out > T1 then there was a borrow */
    /* borrow = (((uint32_t*)w_out)[NUM_LIMBS] > T1[NUM_LIMBS]); */
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));
    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    /* w_out[NUM_LIMBS + 1] = 0; */
    /* w_out[NUM_LIMBS] = 0; */
    /* Constant-time selection between (w - N) and the saved pre-subtraction
     * value, depending on whether the subtraction underflowed. */
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int32_t)N_wordlen);
}
3424
3425 #if defined(__GNUC__)
3426 /* End of enforcing O1 optimize level for gcc*/
3427 #pragma GCC pop_options
3428 #endif
3429
3430 #if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
3431 // End of enforcing optimize off for clang
3432 #pragma clang optimize on
3433 #endif
3434