1 /*
2  * Copyright 2018-2021 NXP
3  * All rights reserved.
4  *
5  *
6  * SPDX-License-Identifier: BSD-3-Clause
7  */
8 
9 #include "fsl_casper.h"
10 #include <math.h> /* ceil TODO check if really need it */
11 
12 /*******************************************************************************
13  * Definitions
14  ******************************************************************************/
15 
16 /* Component ID definition, used by tools. */
17 #ifndef FSL_COMPONENT_ID
18 #define FSL_COMPONENT_ID "platform.drivers.casper"
19 #endif
20 
/* Recoding length for the secure scalar multiplication:
 *  Use n=256 and w=4 --> compute ceil(256/3) = 86 + 1 digits
 *  Use n=384 and w=4 --> compute ceil(384/3) = 128 + 1 digits
 *  Use n=521 and w=4 --> compute ceil(521/3) = 174 + 1 digits
 */
26 
/*!<  Recoding length for the secure scalar multiplication */
enum _casper_ecc_recode_len
{
    kCASPER_ECC_P256_recode_len = 87u,  /* ceil(256/3) + 1 */
    kCASPER_ECC_P384_recode_len = 129u, /* ceil(384/3) + 1 */
    kCASPER_ECC_P521_recode_len = 175u, /* ceil(521/3) + 1 */
};
34 
/*!< Bit length of the operands used for each supported curve. */
enum _casper_ecc_N_bitlen
{
    kCASPER_ECC_P256_N_bitlen = 256u,
    kCASPER_ECC_P384_N_bitlen = 384u,
    kCASPER_ECC_P521_N_bitlen = 576u, /* P-521 operands are zero-padded to 576 bits (multiple of 64) */
};
41 
/*!< Number of 32-bit words used to store one operand for each supported
 *   curve. P-521 values are zero-padded to 576 bits for the 64-bit engine.
 *   (Suffix casing normalized to `U` for consistency.) */
enum _casper_ecc_N_wordlen
{
    kCASPER_ECC_P256_wordlen = 256U / 32U,
    kCASPER_ECC_P384_wordlen = 384U / 32U,
    kCASPER_ECC_P521_wordlen = 576U / 32U,
};
48 
#if defined(__GNUC__)
/* Enforce O1 optimize level, specifically to remove strict-aliasing option.
  (-fno-strict-aliasing is required for this driver). */
#pragma GCC push_options
#pragma GCC optimize("-O1")
#endif

#if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
/* Enforce optimization off for clang, specifically to remove strict-aliasing option.
(-fno-strict-aliasing is required for this driver). */
#pragma clang optimize off
#endif

/* CASPER driver allows usage of 256, 384 and 521 ECC */
/* Largest operand: P-521 padded to 576 bits = 18 words. */
#define CASPER_MAX_ECC_SIZE_WORDLEN (576u / 32U)
/* Matches kCASPER_ECC_P521_recode_len, the longest recoded scalar. */
#define CASPER_RECODE_LENGTH_MAX    175

/* Non-secure alias of the CASPER scratch RAM base address. */
#define CASPER_RAM_BASE_NS (FSL_FEATURE_CASPER_RAM_BASE_ADDRESS)
67 
#if defined(FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED) && FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED
#define CASPER_RAM_OFFSET (FSL_FEATURE_CASPER_RAM_OFFSET)
/* Map a linear CASPER RAM byte address to the interleaved physical layout:
 * word-address bit 2 selects the bank (placed at bit CASPER_RAM_OFFSET), the
 * remaining word-address bits and the byte offset are repacked below it, and
 * the 16-bit result is rebased onto s_casperRamBase. The interleave is its
 * own inverse, hence DEINTERLEAVE == INTERLEAVE. */
#define INTERLEAVE(addr)                                                                                        \
    (((((((addr) >> 2U) & 0x00000001U) << CASPER_RAM_OFFSET) + (((addr) >> 3U) << 2U) + ((addr)&0x00000003U)) & \
      0xFFFFU) |                                                                                                \
     s_casperRamBase)
#define DEINTERLEAVE(addr)    INTERLEAVE(addr)
/* Word/dword accessors that transparently apply the interleaving. Dwords are
 * read/written little-endian as two 32-bit word accesses. */
#define GET_WORD(addr)        (*((uint32_t *)DEINTERLEAVE((uint32_t)(addr))))
#define GET_DWORD(addr)       (((uint64_t)GET_WORD(addr)) | (((uint64_t)GET_WORD(((uint32_t)(addr)) + 4U)) << 32U))
#define SET_WORD(addr, value) *((uint32_t *)INTERLEAVE((uint32_t)(addr))) = ((uint32_t)(value))
#define SET_DWORD(addr, value)                                                               \
    do                                                                                       \
    {                                                                                        \
        SET_WORD(addr, (uint32_t)(value & 0xFFFFFFFFU));                                     \
        SET_WORD(((uint32_t)(addr)) + 4U, (uint32_t)((value & 0xFFFFFFFF00000000U) >> 32U)); \
    } while (false)

/* memcopy is always word aligned */
/* interleaved to interleaved
  static void CASPER_MEMCPY_I2I(void *dst, const void *src, size_t siz)
 */
#define CASPER_MEMCPY_I2I(dst, src, siz)                                   \
                                                                           \
    {                                                                      \
        uint32_t *dst32       = (uint32_t *)(dst);                         \
        const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
        uint32_t i;                                                        \
        for (i = 0U; i < (siz) / 4U; i++)                                  \
        {                                                                  \
            SET_WORD(&dst32[i], GET_WORD(&src32[i]));                      \
        }                                                                  \
    }

/* interleaved to non-interleaved
   static void CASPER_MEMCPY_I2N(void *dst, const void *src, size_t siz)
 */
#define CASPER_MEMCPY_I2N(dst, src, siz)                                   \
                                                                           \
    {                                                                      \
        uint32_t *dst32       = (uint32_t *)(dst);                         \
        const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
        uint32_t i;                                                        \
        for (i = 0U; i < (siz) / 4U; i++)                                  \
        {                                                                  \
            dst32[i] = GET_WORD(&src32[i]);                                \
        }                                                                  \
    }

/* non-interleaved to interleaved
   static void CASPER_MEMCPY_N2I(void *dst, const void *src, size_t siz)
 */
#define CASPER_MEMCPY_N2I(dst, src, siz)                                      \
                                                                              \
    {                                                                         \
        volatile uint32_t *dst32 = (uint32_t *)(dst);                         \
        const uint32_t *src32    = (const uint32_t *)(const uint32_t *)(src); \
        uint32_t i;                                                           \
        for (i = 0U; i < (siz) / 4U; i++)                                     \
        {                                                                     \
            SET_WORD(&dst32[i], src32[i]);                                    \
        }                                                                     \
    }
#else
/* Non-interleaved RAM: accessors degenerate to plain dereferences and the
 * copy macros to memcpy(). */
#define GET_WORD(addr)         (*((uint32_t *)(uint32_t)(addr)))
#define GET_DWORD(addr)        (*((uint64_t *)(addr)))
#define SET_WORD(addr, value)  *((uint32_t *)(uint32_t)(addr)) = ((uint32_t)(value))
#define SET_DWORD(addr, value) *((uint64_t *)(addr)) = ((uint64_t)(value))

#define CASPER_MEMCPY_I2I(dst, src, siz) (void)memcpy(dst, src, siz)
#define CASPER_MEMCPY_I2N(dst, src, siz) (void)memcpy(dst, src, siz)
#define CASPER_MEMCPY_N2I(dst, src, siz) (void)memcpy(dst, src, siz)
#endif
140 
#define WORK_BUFF_MUL4 (N_wordlen_max * 4 + 2) /* ! working buffer is 4xN_wordlen to allow in place math */
#define N_bytelen      (N_wordlen * 4U)        /*  for memory copy and the like */
#define N_dwordlen     (unsigned)(N_wordlen / 2U)

/* Zero w_out[0 .. N_wordlen-1], 4 words per iteration.
 * NOTE(review): assumes N_wordlen is a multiple of 4 — true for all supported
 * curve/key sizes in this file. */
#define PreZeroW(i, w_out)                     \
    for ((i) = 0U; (i) < N_wordlen; (i) += 4U) \
    {                                          \
        SET_WORD(&(w_out)[(i) + 0U], 0U);      \
        SET_WORD(&(w_out)[(i) + 1U], 0U);      \
        SET_WORD(&(w_out)[(i) + 2U], 0U);      \
        SET_WORD(&(w_out)[(i) + 3U], 0U);      \
    } /*  unrolled partly */
/* Zero the upper half of a double-width buffer: w_out[N_wordlen .. 2*N_wordlen(+3)].
 * The inclusive bound intentionally clears one extra group of guard words. */
#define PreZeroW2up(i, w_out)                         \
    for (i = N_wordlen; i <= N_wordlen * 2U; i += 4U) \
    {                                                 \
        SET_WORD(&w_out[i + 0U], 0U);                 \
        SET_WORD(&w_out[i + 1U], 0U);                 \
        SET_WORD(&w_out[i + 2U], 0U);                 \
        SET_WORD(&w_out[i + 3U], 0U);                 \
    } /*  unrolled partly */
161 
/* Macros for the ECC component in Casper */

/* CASPER memory layout for ECC */

/* Base of the ECC working area in CASPER RAM (msg_ret points at the RAM base). */
#define CASPER_MEM ((uint32_t *)msg_ret)
167 
168 /* Currently these macros work on 32-bit platforms  */
169 
170 #define add(c1, c0, a, b)        \
171                                  \
172     do                           \
173     {                            \
174         uint32_t _t;             \
175         _t = a + b;              \
176         c1 = (uint32_t)(_t < a); \
177         c0 = _t;                 \
178                                  \
179     } while (false)
180 
/* (carry, c) = a + b — carry-out only form, a thin wrapper around add(). */
#define add_cout(carry, c, a, b) add((carry), (c), (a), (b))

/* (carryout, c) = a + b + carryin, computed in 64-bit so the carry falls out
 * of bit 32. carryin must be 0 or 1. */
#define add_cout_cin(carryout, c, a, b, carryin)       \
    do                                                 \
    {                                                  \
        uint64_t _t = (uint64_t)(a) + (b) + (carryin); \
        (c)         = (uint32_t)_t;                    \
        (carryout)  = (uint32_t)(_t >> 32);            \
    } while (false)
190 
/* (borrow, c) = a - b; borrow is 1 when b > a (unsigned underflow), else 0. */
#define sub_borrowout(borrow, c, a, b)       \
    do                                       \
    {                                        \
        uint32_t _b = (uint32_t)((b) > (a)); \
        (c)         = (a) - (b);             \
        (borrow)    = _b;                    \
    } while (false)

/* (borrowout, c) = a - b - borrowin. Chains two single subtractions; at most
 * one of the two can borrow, so the borrows may simply be summed. */
#define sub_borrowin_borrowout(borrowout, c, a, b, borrowin) \
    do                                                       \
    {                                                        \
        uint32_t _t, _borrow1, _borrow2;                     \
        sub_borrowout(_borrow1, _t, (a), (b));               \
        sub_borrowout(_borrow2, (c), _t, (borrowin));        \
        (borrowout) = _borrow1 + _borrow2;                   \
    } while (false)
207 
208 #define sub_borrowout_1(borrow, c, a) \
209     do                                \
210     {                                 \
211         uint32_t _b = 0;              \
212         c           = a - b;          \
213         borrow      = _b;             \
214     } while (false)
215 
/* (borrowout, c) = a - 0 - borrowin = a - borrowin, propagating an incoming
 * borrow through a limb whose subtrahend is implicitly zero.
 * Arguments parenthesized for safe expansion of expression arguments. */
#define sub_borrowin_borrowout_1(borrowout, c, a, borrowin) \
    do                                                      \
    {                                                       \
        uint32_t _t, _borrow1, _borrow2;                    \
        sub_borrowout_1(_borrow1, _t, (a));                 \
        sub_borrowout(_borrow2, (c), _t, (borrowin));       \
        (borrowout) = _borrow1 + _borrow2;                  \
    } while (false)
224 
225 /* 32 x 32 --> 64-bit multiplication
226  * (c1,c0) = a * b
227  */
228 #define mul(c1, c0, a, b)                      \
229                                                \
230     do                                         \
231     {                                          \
232         uint64_t __m;                          \
233         __m = (uint64_t)a * (uint64_t)b;       \
234         c0  = (uint32_t)__m;                   \
235         c1  = (uint32_t)(__m >> (uint64_t)32); \
236                                                \
237     } while (false)
238 
/* Multiply-and-accumulate
 * (c1,c0) = a*b+c0
 * Cannot overflow c1: a*b <= (2^32-1)^2 leaves headroom for adding a 32-bit c0.
 * c0/c1 must be plain lvalues (evaluated multiple times).
 */
#define muladd(c1, c0, a, b)   \
                               \
    do                         \
    {                          \
        uint32_t __ma = c0;    \
        mul(c1, c0, a, b);     \
        c0 = c0 + __ma;        \
        c1 = c1 + (c0 < __ma); \
                               \
    } while (0)
252 
/* Multiply-and-accumulate-accumulate
 * (c1,c0) = a*b+c0+c1
 * Cannot overflow 64 bits: (2^32-1)^2 + 2*(2^32-1) = 2^64-1 exactly.
 * c0/c1 must be plain lvalues (evaluated multiple times).
 */
#define muladdadd(c1, c0, a, b)            \
                                           \
    do                                     \
    {                                      \
        uint32_t __maa0 = c0, __maa1 = c1; \
        mul(c1, c0, a, b);                 \
        c0 = c0 + __maa0;                  \
        c1 = c1 + (c0 < __maa0);           \
        c0 = c0 + __maa1;                  \
        c1 = c1 + (c0 < __maa1);           \
                                           \
    } while (0)
268 
/* Field-arithmetic convenience wrappers. Modular add/sub use scratch space at
 * CASPER_MEM[N_wordlen + 4] as the modulus/workspace pointer. */
#define square_casper(c, a) multiply_casper(c, a, a)
#define sub_casper(c, a, b) CASPER_montsub(c, a, b, &CASPER_MEM[(N_wordlen + 4U)])
#define add_casper(c, a, b) CASPER_montadd(c, a, b, &CASPER_MEM[(N_wordlen + 4U)])
#define mul2_casper(c, a)   add_casper(c, a, a)
#define half(c, a, b)       CASPER_half(c, a, b)
274 /*******************************************************************************
275  * Variables
276  ******************************************************************************/
277 
/*  The model for this algo is that it can be implemented for a fixed size RSA key */
/*  for max speed. If this is made into a variable (to allow varying size), then */
/*  it will be slower by a bit. */
/*  The file is compiled with N_bitlen passed in as number of bits of the RSA key */
/*  #define N_bitlen 2048 */
static size_t N_wordlen = 0U; /* ! number of words (e.g. 4096/32 is 128 words) */

/* Base address of the CASPER scratch RAM; the INTERLEAVE macro rebases onto this. */
static uint32_t s_casperRamBase = CASPER_RAM_BASE_NS;
/* Working/result buffer in CASPER RAM, aliased by the CASPER_MEM macro. */
static uint32_t *msg_ret        = (uint32_t *)CASPER_RAM_BASE_NS;
287 
/* NISTp-256 = 2^256-2^224+2^192+2^96-1 (little-endian 32-bit words). */
static uint32_t NISTp256[256 / 32u] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0x00000000,
                                       0x00000000,  0x00000000,  0x00000001,  0xffffffffU};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp256_q[256 / 32u] = {0xfc632551U, 0xf3b9cac2U, 0xa7179e84U, 0xbce6faadU,
                                         0xffffffffU, 0xffffffffU, 0x00000000,  0xffffffffU};

/* R = 2^256 mod p, the value "1" in Montgomery form. */
static uint32_t NISTr256[256 / 32u] = {0x00000001,  0x00000000,  0x00000000,  0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xfffffffeU, 0x00000000};

/* -p^-1 mod 2^64 = 1 (Montgomery constant Np for P-256). */
static uint32_t Np256[2] = {1, 0};
301 
/* NISTp-384 =  2^384 - 2^128 - 2^96 + 2^32 - 1 (little-endian 32-bit words). */
static uint32_t NISTp384[384 / 32u] = {0xffffffffU, 0x00000000,  0x00000000,  0xffffffffU, 0xfffffffeU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp384_q[384 / 32u] = {0xccc52973U, 0xecec196aU, 0x48b0a77aU, 0x581a0db2U, 0xf4372ddfU, 0xc7634d81U,
                                         0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};

/* R = 2^384 mod p, the value "1" in Montgomery form.
 * (= 2^128 + 2^96 - 2^32 + 1; the previous comment said 2^256.) */
static uint32_t NISTr384[384 / 32u] = {0x00000001, 0xffffffffU, 0xffffffffU, 0x00000000, 0x1, 0, 0, 0, 0, 0, 0, 0};

/* -p^-1 mod 2^64 = 0x100000001 (Montgomery constant Np for P-384). */
static uint32_t Np384[2] = {1, 1};
315 
/* NISTp-521 =  2^521 - 1, zero-padded to 576 bits (little-endian 32-bit words). */
static uint32_t NISTp521[576 / 32U] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU,      0};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp521_q[576 / 32U] = {0x91386409U, 0xbb6fb71eU, 0x899c47aeU, 0x3bb5c9b8U, 0xf709a5d0U, 0x7fcc0148U,
                                         0xbf2f966bU, 0x51868783U, 0xfffffffaU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                         0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU,      0};

/* R = 2^576 mod p = 2^55, the value "1" in Montgomery form. */
static uint32_t NISTr521[576 / 32U] = {0, 0x800000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* -p^-1 mod 2^64 = 1 */
static uint32_t Np521[2] = {1, 0};
331 
332 /*******************************************************************************
333  * Prototypes
334  ******************************************************************************/
335 
/* Convert a projective point (X1 : Y1 : Z1)
 * to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3)
 * The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap
 */
void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);

/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
 *  where (X1: Y1: Z1) != (X2 : Y2 : Z2)
 * (X3 : Y3: Z3) may be the same as one of the inputs.
 */
void Jac_addition(uint32_t *X3,
                  uint32_t *Y3,
                  uint32_t *Z3,
                  uint32_t *X1,
                  uint32_t *Y1,
                  uint32_t *Z1,
                  uint32_t *X2,
                  uint32_t *Y2,
                  uint32_t *Z2);

/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2)
 * where (X1: Y1: Z1) != (X2, Y2)
 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
 * Source: 2004 Hankerson-Menezes-Vanstone, page 91.
 */
void Jac_add_affine(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2);

/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, page 91.
 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
 * (X3 : Y3: Z3) may be the same as the input.
 */
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);

/* Constant time elliptic curve scalar multiplication.
 * Source: https://eprint.iacr.org/2014/130.pdf
 * when using w = 4.
 * Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
 * p is the prime used to define the finite field F_p
 * q is the (prime) order of the curve
 */
void Jac_scalar_multiplication(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q);

/* Compute the double scalar multiplication
 * (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
 * Using Shamir's trick and precomputing 16 points.
 * This code is *not* constant time since this is used
 * for verification only.
 */
void double_scalar_multiplication(uint32_t *X3,
                                  uint32_t *Y3,
                                  uint32_t *Z3,
                                  uint32_t *X1,
                                  uint32_t *Y1,
                                  uint32_t *k1,
                                  uint32_t *X2,
                                  uint32_t *Y2,
                                  uint32_t *k2);

/* Compute inversion modulo NIST-p384 using Fermats little theorem.
 * Using c = a^(p-2) = a^(-1) mod p.
 * This computes the modular inversion if all arithmetic is "regular"
 * modular arithmetic or computes automatically the Montgomery inverse
 * if all arithmetic is Montgomery arithmetic.
 */
static void invert_mod_p384(uint32_t *c, uint32_t *a);

/* Modular inversion for NIST-P256 */
static void invert_mod_p256(uint32_t *c, uint32_t *a);

/* Modular inversion for NIST-P521 */
static void invert_mod_p521(uint32_t *c, uint32_t *a);

/* Convert A to Montgomery form, result in C.
 * A and C do not need to be in Casper memory. */
static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A);
static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A);
static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A);

/* Montgomery modular subtraction/addition: C = A -/+ B mod *mod. */
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);

/* Compute c = a/2 mod p where b is scratch space. */
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b);

/* Word-aligned memcpy that handles interleaved CASPER RAM on either side. */
void CASPER_MEMCPY(void *dst, const void *src, size_t siz);

static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[]);

/* Absolute value of a signed 8-bit recoding digit. */
static uint8_t int8abs(int8_t v);

/* Constant time select c = a if m = 0 or
 *                      c = b if m = 1
 * a, b, c are n words
 */
static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n);

/* Dumb n-limb addition of c=a+b, return carry. */
static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);

#if 0
/* Dumb n-limb addition of c=a+b, return carry. */
static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);

/* Dumb n-limb subtraction of c=a-b, return borrow. */
static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);
#endif

/* Dumb n-limb subtraction of c=a-b, return borrow. */
static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);

int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
                                 const unsigned exp_pubkey,
                                 const unsigned pubkey[N_wordlen_max],
                                 unsigned MsgRet[WORK_BUFF_MUL4]);

int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],
                                     const unsigned exp_pubkey,
                                     const unsigned pubkey[N_wordlen_max],
                                     unsigned MsgRet[WORK_BUFF_MUL4]);

void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[]);

void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret);
void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[]);
void MultprecModulo(unsigned r_out[], const unsigned v[], int top);
void MultprecCiosMul(
    unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np);
void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[]);

/* Constant-time CIOS Montgomery multiplication variants. */
static void MultprecCiosMul_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np);

static void MultprecCiosMul521_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np);

/* Multi-limb shifts by c bits; the Sysram variant reads from system RAM. */
static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c);
static void shiftright(uint32_t *z, uint32_t *x, uint32_t c);
static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c);
475 
476 /*******************************************************************************
477  * Code
478  ******************************************************************************/
479 
CA_MK_OFF(const void * addr)480 __STATIC_FORCEINLINE uint32_t CA_MK_OFF(const void *addr)
481 {
482     return ((uint32_t)(const uint32_t *)addr - s_casperRamBase);
483 }
484 
485 #if 1
Accel_done(void)486 __STATIC_FORCEINLINE void Accel_done(void)
487 {
488     register uint32_t status;
489     do
490     {
491         status = CASPER->STATUS;
492     } while (0U == (status & CASPER_STATUS_DONE_MASK));
493 }
494 
Accel_SetABCD_Addr(uint32_t ab,uint32_t cd)495 __STATIC_FORCEINLINE void Accel_SetABCD_Addr(uint32_t ab, uint32_t cd)
496 {
497     CASPER->CTRL0 = ab | (cd << 16); /* CDoffset << 16 | ABoffset */
498 }
499 
Accel_crypto_mul(uint32_t ctrl1)500 __STATIC_FORCEINLINE void Accel_crypto_mul(uint32_t ctrl1)
501 {
502     CASPER->CTRL1 = ctrl1;
503 }
504 #else
505 #include "intrinsics.h"
506 #define Accel_done()                                       \
507     {                                                      \
508         register uint32_t status;                          \
509         do                                                 \
510         {                                                  \
511             status = CASPER_Rd32b(CASPER_CP_STATUS);       \
512         } while (0 == (status & CASPER_STATUS_DONE_MASK)); \
513     }
514 #if 0
515 __STATIC_FORCEINLINE void Accel_done(void)
516 {
517     register uint32_t status;
518     do
519     {
520         status = CASPER->STATUS;
521     } while (0 == (status & CASPER_STATUS_DONE_MASK));
522 }
523 #endif
524 #define Accel_SetABCD_Addr(ab, cd) CASPER_Wr32b((uint32_t)ab | ((uint32_t)cd << 16), CASPER_CP_CTRL0);
525 #define Accel_crypto_mul(ctrl1)    CASPER_Wr32b((uint32_t)ctrl1, CASPER_CP_CTRL1);
526 #endif
527 
Accel_IterOpcodeResaddr(uint32_t iter,uint32_t opcode,uint32_t resAddr)528 __STATIC_FORCEINLINE uint32_t Accel_IterOpcodeResaddr(uint32_t iter, uint32_t opcode, uint32_t resAddr)
529 {
530     return CASPER_CTRL1_ITER(iter) | CASPER_CTRL1_MODE(opcode) | (resAddr << 16);
531 }
532 
CASPER_MEMCPY(void * dst,const void * src,size_t siz)533 void CASPER_MEMCPY(void *dst, const void *src, size_t siz)
534 {
535     bool bdst =
536         ((((uint32_t)(uint32_t *)dst) | 0x10000000u) >= ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) &&
537          (((uint32_t)(uint32_t *)dst) | 0x10000000u) <
538              ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u);
539 
540     bool bsrc = ((((uint32_t)(const uint32_t *)src) | 0x10000000u) >=
541                      ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) &&
542                  (((uint32_t)(const uint32_t *)src) | 0x10000000u) <
543                      ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u);
544 
545     if (bdst && bsrc)
546     {
547         CASPER_MEMCPY_I2I(dst, src, siz);
548     }
549     else if (bdst && !bsrc)
550     {
551         CASPER_MEMCPY_N2I(dst, src, siz);
552     }
553     else if (!bdst && bsrc)
554     {
555         CASPER_MEMCPY_I2N(dst, src, siz);
556     }
557     else
558     {
559         (void)memcpy(dst, src, siz);
560     }
561 }
562 
/* Constant time select c = a if m = 0 or
 *                      c = b if m = 1
 * a, b, c are n words
 * Builds the all-zeros/all-ones masks m1 = -m, m2 = ~m1 and merges word-wise,
 * so no data-dependent branch is taken.
 * (Removed the garbled tag-prefix that corrupted the original definition line.)
 */
static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n)
{
    uint32_t m1 = 0U - (uint32_t)m, m2 = ~m1;
    int i;

    for (i = 0; i < n; i++)
    {
        SET_WORD(&c[i], (GET_WORD(&a[i]) & m2) | (GET_WORD(&b[i]) & m1));
    }
}
577 
578 /*  Compute R`, which is R mod N. This is done using subtraction */
579 /*  R has 1 in N_wordlen, but we do not fill it in since borrowed. */
580 /*  Exp-pubkey only used to optimize for exp=3 */
MultprecMontCalcRp(unsigned Rp[],const unsigned exp_pubkey,const unsigned Nmod[])581 void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[])
582 {
583     uint32_t i;
584 
585     /*  R is 2^n where n is 1 bit longer than Nmod, so 1 followed by 32 or 64 0 words for example */
586     /*  Note that Nmod's upper most bit has to be 1 by definition, so one subtract is enough. We */
587     /*  do not set the 1 since it is "borrowed" so no point */
588     PreZeroW(i, Rp);
589     Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
590     Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(Rp)));
591     Accel_done();
592     /*  final borrow cannot happen since we know we started with a larger number */
593 }
594 
595 /*  MultprecMultiply - multiple w=u*v (per Knuth) */
596 /*  w_out is 2x the size of u and v */
MultprecMultiply(unsigned w_out[],const unsigned u[],const unsigned v[])597 void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[])
598 {
599     uint32_t i, j;
600 
601     /*  Knuth 4.3.1 - Algorithm M */
602     /*    Compute w = u * v */
603     /*  u and v are N bits long in 32 bit word form */
604     /*  w is 2*N bits long in 32 bit word form */
605     /*  Note: We just multiply in place */
606 
607     /*  Step 1. Fill w[t-1:0] with 0s, the upper half will be written as we go */
608     PreZeroW(i, w_out);
609 
610     /*  We do 1st pass NOSUM so we do not have to 0 output */
611     Accel_SetABCD_Addr(CA_MK_OFF(&v[0]), CA_MK_OFF(u));
612     Accel_crypto_mul(
613         Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464NoSum, CA_MK_OFF(&w_out[0])));
614     Accel_done();
615     /*  Step 2. iterate over N words of v using j */
616     for (j = 2U; j < N_wordlen; j += 2U)
617     {
618         /*  Step 2b. Check for 0 on v word - skip if so since we 0ed already */
619         /*  Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
620         if (0U != (GET_WORD(&v[j])) || 0U != (GET_WORD(&v[j + 1U])))
621         {
622             Accel_SetABCD_Addr(CA_MK_OFF(&v[j]), CA_MK_OFF(u));
623             Accel_crypto_mul(
624                 Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464Sum, CA_MK_OFF(&w_out[j])));
625             Accel_done();
626         }
627     }
628 }
629 
630 /*  MultprecModulo performs divide to get remainer as needed for RSA */
631 /*  This performs (q,r) = u/v, but we do not keep q */
632 /*  r_out is module (remainder) and is 2*N */
633 /*  u is in r_out (1st N) at start (passed in) */
634 /*  v is N long */
MultprecModulo(unsigned r_out[],const unsigned v[],int top)635 void MultprecModulo(unsigned r_out[], const unsigned v[], int top)
636 {
637     uint64_t u64;                      /*  use 64 bit math mixed with 32 bit */
638     unsigned u32;                      /*  allows us to work on U in 32 bit */
639     unsigned u_n, ul16, uh16, *u_shft; /*  u_shft is because r_out is u initially */
640     unsigned vl16, vh16, v_Nm1;
641     unsigned q_hat, r_hat, q_over;
642     unsigned borrow, carry;
643     uint32_t i;
644     int j, tmp;
645 
646     /*  Knuth 4.3.1 - Algorithm D */
647     /*    Compute q = u / v giving remainder r = u mod v */
648     /*    -- we only want r, so we build qhat but do not store the Qs */
649     /*  v is N long, with u,q,r 2N long because u is slowly replavced by r. */
650     /*  We normalize/unnormlize per Knuth in the buffer (not copied) */
651 
652     /*  Step 1. Normalize value so MSb is in v[n-1]. Remember that v is */
653     /*  the public key - to call it a 2048 bit number, they cannot have 0 */
654     /*  in the MSb (or it would be less than 2048 bits) and so we know we */
655     /*  are normalized already. Therefore, u is effectively shifted already. */
656     /*  For u, we have it in r_out. u[n] holds any overflow */
657     /*  Since divide on CM3/4 is 32/32=32, we break into 16 bit halves, but */
658     /*  multiply can be 32x32=64. */
659     u_n    = 0;
660     u_shft = r_out;                       /*  u (shifted) is in r_out */
661 
662     v_Nm1 = GET_WORD(&v[N_wordlen - 1U]); /*  MSw of public key */
663     vl16  = v_Nm1 & 0xFFFFU;              /*  lower 16 */
664     vh16  = v_Nm1 >> 16;                  /*  upper 16 */
665     /*  Step 2. Iterate j from m-n down to 0 (M selected per Knuth as 2*N) */
666     for (j = top; j >= 0; j--)
667     {
668         /*  Step 3. estimate q_hat as (U[j+n]*B + U[j+n-1]) / V[n-1] */
669         /*  Note: using subset of Knuth algo since v is 1/2 len of u (which is */
670         /*  from multiply or x^2 leading into this). */
671         u32  = u_n;                                   /*  pickup u4u3u2, knowing u4 is 0 */
672         u64  = ((uint64_t)u_n << 32) | GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 1U]);
673         ul16 = (unsigned int)(u64 & 0xFFFFU);         /*  lower 16 */
674         uh16 = (unsigned int)((u64 >> 16) & 0xFFFFU); /*  upper 16 */
675 
676         /*  we see if even possible (u large enough relative to v) */
677         if ((u32 - v_Nm1) <= u32)
678         {
679             u32 -= v_Nm1;
680             q_over = 1; /*  overflow from the sub */
681         }
682         else
683         {
684             q_over = 0;
685         }
686         /*  q_hat = u32 / vh16 -- is the upper partial value */
687         /*  estimate; if too much, then back down by 1 or 2 */
688         q_hat = u32 / vh16;
689         r_hat = u32 - (q_hat * vh16);
690         /*  see if Q is more than 16 bits or remainder is too large  (over div) */
691         if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | uh16)))
692         {
693             /*  too much - undo a division */
694             q_hat--;
695             r_hat += vh16;
696             /*  check if still too much */
697             if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | uh16)))
698             {
699                 q_hat--; /*  yes, so undo a 2nd */
700             }
701         }
702 
703         /*  compose u3u2uh16, then sub q_hat*v if OK */
704         u64 = (((uint64_t)u32 << 16) | uh16) - ((uint64_t)q_hat * v_Nm1);
705         if (0U != (u64 >> 48))
706         {
707             /*  no, so add v back */
708             u32 = (unsigned)(u64 + v_Nm1);
709             q_hat--;
710         }
711         else
712         {
713             u32 = (unsigned)u64;
714         }
715         tmp = (int32_t)(uint32_t)(q_hat << 16); /*  quotient upper part */
716 
717         /*  divide lower part: q = u2uh16ul16 / v. */
    /*  estimate and add back if over divided */
719         q_hat = u32 / vh16;
720         r_hat = u32 - (q_hat * vh16);
721         if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | ul16)))
722         {
723             /*  too much - undo a division */
724             q_hat--;
725             r_hat += vh16;
726             /*  check if still too much */
727             if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | ul16)))
728             {
729                 q_hat--; /*  yes, so undo a 2nd */
730             }
731         }
732 
733         /*  compose u2uh16ul16, then sub q_hat*v if OK */
734         u64 = (((uint64_t)u32 << 16) | ul16) - ((uint64_t)q_hat * v_Nm1);
735         if (0U != (u64 >> 48))
736         {
737             /*  no, so add v back */
738             r_hat = (unsigned)(u64 + v_Nm1);
739             q_hat--;
740         }
741         else
742         {
743             r_hat = (unsigned)u64;
744         }
745         q_hat |= (unsigned)tmp; /*  other half of the quotient */
746         while ((q_over != 0U) || ((uint64_t)q_hat * GET_WORD(&v[N_wordlen - 2U])) >
747                                      ((1ULL << 32) * r_hat) + (uint64_t)GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 2U]))
748         { /*  if Qhat>b, then reduce to b-1, then adjust up Rhat */
749             q_hat--;
750             r_hat += v_Nm1;
751             if (r_hat < v_Nm1)
752             {
753                 break; /*  no overflow */
754                        /*  else repeat since Rhat >= b */
755             }
756         }
757 
758         /*  Step 4. Multiply and subtract. We know the amount, */
759         /*          so we do the schoolboy math. Have to do on */
760         /*          the large value. */
761         if (q_hat != 0U)
762         {
763             borrow = 0;
764             for (i = 0; i < N_wordlen; i++)
765             {
766                 u64    = (uint64_t)q_hat * GET_WORD(&v[i]) + borrow;
767                 borrow = (unsigned)(u64 >> 32);
768                 if (GET_WORD(&u_shft[i + (unsigned)j]) < (unsigned)u64)
769                 {
770                     borrow++; /*  carry the overflow */
771                 }
772                 SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) - (unsigned)u64);
773             }
774             u_n -= borrow; /*  overflow from shift left does not fit otherwise */
775         }
776 
777         /*  Store 5. (update Q - we don't), and add back V to remainder if we over-subtracted */
778         /*           That restores remainder to correct (we could only be off by 1) */
779         /*           This should happen very rarely. */
780         if (u_n != 0U)
781         {
782             carry = 0;
783             for (i = 0; i < N_wordlen; i++)
784             {
785                 SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + carry);
786                 carry = (GET_WORD(&u_shft[i + (unsigned)j]) < carry) ? 1U : 0U;
787                 SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + GET_WORD(&v[i]));
788                 if (GET_WORD(&u_shft[i + (unsigned)j]) < GET_WORD(&v[i]))
789                 {
790                     carry++;
791                 }
792             }
793         }
794         u_n = GET_WORD(
795             &u_shft[(uint32_t)j + N_wordlen - 1U]); /*  hold upper part of u to catch overflow (to borrow from) */
796     }
797     /*  low N bits of r are valid as remainder */
798 }
799 
800 /*  We convert X into a Mont form number. Note length of arrays: */
801 /*  x is N_wordlen, Nmod is N_wordlen */
802 /*  Rp is N_wordlen (it is R` which is R mod N) */
803 /*  Xmont_out is N_wordlen*2+1 */
MultprecMontPrepareX(unsigned Xmont_out[],const unsigned x[],const unsigned Rp[],const unsigned Nmod[])804 void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[])
805 {
806     MultprecMultiply(Xmont_out, x, Rp);
807     MultprecModulo(Xmont_out, Nmod, (int32_t)N_wordlen);
808 }
809 
/*  Derive the 64-bit Montgomery constant N` = -Nmod^-1 mod 2^64 from the
 *  two least significant 32-bit words of the modulus.
 *  np64_ret receives the result as two 32-bit words (written as one uint64_t;
 *  the caller's buffer must be suitably aligned). */
void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret) /*  only pass the low order double word */
{
    uint64_t inv, n0;
    int iter;

    /* Low 64 bits of the modulus, assembled from two 32-bit words. */
    n0 = GET_WORD(&Nmod[0]) | ((uint64_t)GET_WORD(&Nmod[1]) << 32);

    /* Seed with an inverse that is correct mod 2^4, then Newton-iterate:
       each pass of inv = (2 - n0*inv)*inv doubles the number of correct
       low-order bits (4 -> 8 -> 16 -> 32 -> 64). 4 passes, 8 uint64_t
       multiplies total. */
    inv = (((2U + n0) & 4U) << 1) + n0;
    for (iter = 0; iter < 4; iter++)
    {
        inv = (2U - n0 * inv) * inv;
    }

    /* N` is the two's-complement negation of the inverse, mod 2^64. */
    *((uint64_t *)(uintptr_t)np64_ret) = 0ULL - inv;
}
824 
825 /*  CIOS Multiply. This is the Coarse Integrated form where the values are */
826 /*  multiplied and reduced for each step of "i". This uses less memory and */
827 /*  is faster as a result. Note that this is used to square as well as mul, */
828 /*  so not as fast as pure squaring could be. */
/*
 * Montgomery CIOS multiply/reduce using the CASPER accelerator.
 *
 * When a != NULL: w_out = a * b * R^-1 mod Nmod (Montgomery product).
 * When a == NULL: w_out (already holding a double-length value) is only
 * Montgomery-reduced. Np points at the 64-bit N` from MultprecGenNp64().
 *
 * NOTE(review): the uint32_t* -> uint64_t* retyping below relies on
 * -fno-strict-aliasing (see the pragma block at the top of this file) and on
 * the CASPER RAM buffers being 8-byte aligned - confirm for new targets.
 */
void MultprecCiosMul(
    unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np)
{
    int j;
    uint32_t i;
    /* Scratch double-word m = (w mod 2^64) * N`, the per-round reduction factor. */
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;

    Np64 = *(uint64_t *)(uintptr_t)Np;

    /* Re-view the 32-bit word buffers as 64-bit limbs for the accelerator. */
    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /*  if !a, we are reducing only */
        PreZeroW(i, w_out);
    }
    /* Clear the two overflow limbs above the N_dwordlen-limb accumulator. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /*  with accelerator */

    /*  loop i and then reduce after each j round */
    for (i = 0; i < N_dwordlen; i++)
    {
        /*  Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
        /*  push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /*  if mul&reduce vs. reduce only */
            /* w += b[i] * a, done by the accelerator; detect carry out of the
               top limb by comparing it before and after. */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[i]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /*  max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /*  so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /*  accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /*  w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        /* m = w[0] * N` mod 2^64: chosen so that w + m*N has a zero low limb. */
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /*  prime for 1st; modulo a double-word */

        /*  we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /*  need its carry. We let the accel do this separately - really need a mode to */
        /*  do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* Shift the accumulator down one 64-bit limb (drop the now-zero w[0]). */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        /* Fold the saved overflow limb plus the carry back on top. */
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /*  now check if need to subtract Nmod */
    if (0U != (GET_WORD(&w_out[N_wordlen])))
    {
        j = 1; /*  we have to subtract for sure if carry up */
    }
    else
    {
        /* No carry: compare w against Nmod from the most significant word
           down; subtract only if w >= Nmod. */
        j = 0;
        for (i = N_wordlen - 1U; i > 0U; i--)
        {
            if (GET_WORD(&w_out[i]) != GET_WORD(&Nmod[i]))
            {
                j = (int32_t)(GET_WORD(&w_out[i]) > GET_WORD(&Nmod[i])); /*  if larger sub */
                break; /*  we would remove the break if worrying about side channel */
            }
        }
    }
    if (0 == j)
    {
        return; /*  Is smaller than Nmod, so done. */
    }
    /* Conditional final subtraction w -= Nmod, performed by the accelerator. */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();
    /*  last borrow is OK since we know it could only be <2N and */
}
918 
919 /*  RSA_MontSignatureToPlaintextFast: */
920 /*  MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
921 /*  exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. */
922 /*  signature = N bitpos len long "message" to process in Montgomery form - so saving conversion (divide) */
923 /*  pubkey = N bitpos len long public key to process signature with */
924 /*  returns: 0 */
925 /*  */
/*  Algo: compute M = signature^e mod public_key */
927 /*        where M is original plaintext, signature is signed value */
928 /*        note: e is usually either 0x3 or 0x10001 */
RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],const unsigned exp_pubkey,const unsigned pubkey[N_wordlen_max],unsigned MsgRet[WORK_BUFF_MUL4])929 int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],
930                                      const unsigned exp_pubkey,
931                                      const unsigned pubkey[N_wordlen_max],
932                                      unsigned MsgRet[WORK_BUFF_MUL4])
933 {
934     int bidx = 0;
935     int bitpos;
936     unsigned np64[2];
937 
938     /*  MsgRet working area: */
939     /*  0..N = RESULT, starting with S` */
940     /*  N..N*2 = S` and then working BASE during math. */
941     /*  N*2..N*4+2 = temp working area for Mont mul */
942 
943     /*  1. Copy sig into MsgRet so we have one working result buffer */
944     CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result],
945                       (const uint32_t *)(uintptr_t)mont_signature, N_bytelen);
946     MultprecGenNp64(pubkey, np64);                       /*  Generate N` from LSW of N (LSW being lowest 64b word) */
947     bitpos = (int8_t)(uint8_t)(31U - __CLZ(exp_pubkey)); /*  count of bits after the left most 1 */
948     while (--bitpos >= 0)
949     {
950         /*  This operates on: */
951         /*    result = 1; */
952         /*    base = signature */
953         /*    loop while exponent bits from MSb to LSb */
954         /*      if (exp bit is 1) */
955         /*        result = result * base */
956         /*      base = base^2 */
957         /*  Because the MSb of exp is always 1 by definition, we can invert this a bit: */
958         /*    base = signature` */
959         /*    result = base; equivalent to result = 1*base from 1st pass, but now square is needed 1st */
960         /*    loop while exponent bits from MSb-1 to LSb */
961         /*      base = base^2 */
962         /*      if (exp bit is 1) */
963         /*        result = result * base */
964         /*  This ends up doing the same thing but skips two wasteful steps of multiplying by 1 and */
965         /*  a final squaring never used. */
966         /*  */
967         /*  Next we have the problem that CIOS mul needs a separate dest buffer. So, we bounce */
968         /*  base between base and temp, and likewise for result. */
969         MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase],
970                         &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
971                         &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], pubkey, np64);
972         if (0U != (exp_pubkey & (uint32_t)(uint8_t)(1U << (uint8_t)bitpos))) /*  where e is 1 */
973         {
974             /*  result has result, so we need to work into other temp area */
975             MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
976                             &MsgRet[kCASPER_RamOffset_Result],
977                             &MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase], pubkey, np64);
978             /*  we have to copy back to result */
979 
980             // CASPER_MEMCPY_I2I(&MsgRet[kCASPER_RamOffset_Result],
981             //      &MsgRet[bidx ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], N_bytelen);
982         }
983         else
984         {
985             bidx = (int32_t)(uint32_t) ~(unsigned)bidx;
986         }
987     }
988 
989     CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result],
990                       (uint32_t *)(uintptr_t)&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
991                       N_bytelen);
992 
993     /*  final step is one more reduction to get back to normal form (ie. divide R out) */
994     MultprecCiosMul(&MsgRet[kCASPER_RamOffset_Result], NULL, NULL, pubkey, np64);
995     return (0); /*  always 0 */
996 }
997 
998 /*  RSA_SignatureToPlaintextFast: */
999 /*  MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
1000 /*  exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. */
1001 /*  signature = N bitpos len long "message" to process in normal form - so converted to Mont form */
1002 /*  pubkey = N bitpos len long public key to process signature with */
1003 /*  returns: 0 */
1004 /*  */
/*  Algo: compute M = signature^e mod public_key */
1006 /*        where M is original plaintext, signature is signed value */
1007 /*        note: e is usually either 0x3 or 0x10001 */
/* Modular exponentiation for a signature in NORMAL form: converts it to
 * Montgomery form first, then delegates to the Montgomery-form routine.
 * Always returns 0 (the value of RSA_MontSignatureToPlaintextFast). */
int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
                                 const unsigned exp_pubkey,
                                 const unsigned pubkey[N_wordlen_max],
                                 unsigned MsgRet[WORK_BUFF_MUL4])
{
    /*  MsgRet working area: */
    /*  0..N = RESULT, starting with S`; it is used for R` just during creation of S` */
    /*  N..N*2 = S` and then working BASE during math. Note overflow beyond N*2 when making S` */
    /*  N*2..N*4+2 = temp working area for Mont mul */

    MultprecMontCalcRp(&MsgRet[kCASPER_RamOffset_Result], exp_pubkey, pubkey); /*  calculate R` (=R mod N) */
    /* S` = signature * R` mod N: signature enters Montgomery form in the Base slot. */
    MultprecMontPrepareX(&MsgRet[kCASPER_RamOffset_Base], signature, &MsgRet[kCASPER_RamOffset_Result],
                         pubkey); /*  X*R1` mod N */
    return (RSA_MontSignatureToPlaintextFast(&MsgRet[kCASPER_RamOffset_Base], exp_pubkey, pubkey, MsgRet));
}
1023 
1024 /*!
1025  * brief Performs modular exponentiation - (A^E) mod N.
1026  *
1027  * This function performs modular exponentiation.
1028  *
1029  * param base CASPER base address
1030  * param signature first addend (in little endian format)
1031  * param pubN modulus (in little endian format)
 * param wordLen Size of pubN in 32-bit words
1033  * param pubE exponent
1034  * param[out] plaintext Output array to store result of operation (in little endian format)
1035  */
void CASPER_ModExp(
    CASPER_Type *base, const uint8_t *signature, const uint8_t *pubN, size_t wordLen, uint32_t pubE, uint8_t *plaintext)
{
/* CASPER RAM layout used by this call: modulus first, signature right after. */
#define PK_LOC  &msg_ret[kCASPER_RamOffset_Modulus]
#define SIG_LOC &msg_ret[(unsigned)kCASPER_RamOffset_Modulus + N_wordlen_max]

    /* NOTE(review): `base` is unused here; the CASPER RAM pointer was already
       selected in CASPER_Init(). wordLen is a count of 32-bit words, since it
       is stored directly into the word-length global. */
    N_wordlen = wordLen; /* set global variable for key length - used by RSA_SignatureToPlaintextFast()  */
    /* Stage modulus and signature into CASPER-interleaved RAM. */
    CASPER_MEMCPY_N2I(PK_LOC, (const uint32_t *)(uintptr_t)pubN, N_bytelen);
    CASPER_MEMCPY_N2I(SIG_LOC, (const uint32_t *)(uintptr_t)signature, N_bytelen);
    (void)RSA_SignatureToPlaintextFast((const unsigned *)(uintptr_t)(SIG_LOC), pubE,
                                       (const unsigned *)(uintptr_t)(PK_LOC), (unsigned int *)(uintptr_t)msg_ret);

    /* Result sits at the start of msg_ret; copy out to the caller's buffer. */
    CASPER_MEMCPY_I2N((uint32_t *)(uintptr_t)plaintext, msg_ret, N_bytelen);
}
1050 
1051 /*!
1052  * brief Enables clock and disables reset for CASPER peripheral.
1053  *
1054  * Enable clock and disable reset for CASPER.
1055  *
1056  * param base CASPER base address
1057  */
void CASPER_Init(CASPER_Type *base)
{
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
#if defined(CASPER_CLOCKS)
    CLOCK_EnableClock(kCLOCK_Casper);
#endif
#endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
#if defined(CASPER_RSTS)
    RESET_PeripheralReset(kCASPER_RST_SHIFT_RSTn);
#endif
#if defined(FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE) && (FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE > 0)
    /* Enable hardware interleaving to RAMX0 and RAMX1 for CASPER */
    SYSCON->CASPER_CTRL = SYSCON_CASPER_CTRL_INTERLEAVE(1);
#endif /* FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE */
    /* If Casper init is called with secure address, use secure address also for accessing Casper RAM. */
    /* Bit 28 of the peripheral base selects the secure alias; mirror it into the RAM base. */
    s_casperRamBase = (unsigned)CASPER_RAM_BASE_NS | ((uint32_t)base & 0x10000000u);
    msg_ret         = (uint32_t *)s_casperRamBase;
}
1076 
1077 /*!
1078  * brief Disables clock for CASPER peripheral.
1079  *
1080  * Disable clock and enable reset.
1081  *
1082  * param base CASPER base address
1083  */
void CASPER_Deinit(CASPER_Type *base)
{
    /* Assert reset first, then gate the clock - the reverse of CASPER_Init(). */
#if defined(CASPER_RSTS)
    RESET_SetPeripheralReset(kCASPER_RST_SHIFT_RSTn);
#endif
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
#if defined(CASPER_CLOCKS)
    CLOCK_DisableClock(kCLOCK_Casper);
#endif
#endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
}
1095 
1096 /* New ECC code which uses Casper. */
1097 
1098 /* Set the prime modulus mod in Casper memory.
1099  */
/* Select the ECC curve: set the global word length and load the prime modulus
 * (plus 4 zero pad words) into CASPER memory at slot offset N_wordlen+4.
 * BUGFIX: `mod` was previously left uninitialized when `curve` matched none of
 * the supported values, so the trailing CASPER_MEMCPY dereferenced an
 * indeterminate pointer (undefined behavior). Unsupported curves now return
 * without touching CASPER memory or N_wordlen. */
void CASPER_ecc_init(casper_algo_t curve)
{
    uint32_t *mod = NULL;

    if (curve == kCASPER_ECC_P256)
    {
        N_wordlen = 256U / 32U;
        mod       = NISTp256;
    }
    else if (curve == kCASPER_ECC_P384)
    {
        N_wordlen = 384U / 32U;
        mod       = NISTp384;
    }
    else if (curve == kCASPER_ECC_P521)
    {
        /* P-521 values are stored padded out to 576 bits (18 words). */
        N_wordlen = 576U / 32U;
        mod       = NISTp521;
    }
    else
    {
        /* Unsupported curve: do nothing rather than use an uninitialized pointer. */
        return;
    }

    /* Copy the prime modulus into CASPER memory... */
    CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U)], mod, N_wordlen * sizeof(uint32_t));
    /* ...and zero the 4 pad words that follow it. */
    uint8_t a[((CASPER_MAX_ECC_SIZE_WORDLEN + 4U) - CASPER_MAX_ECC_SIZE_WORDLEN) * sizeof(uint32_t)] = {0};
    CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U) + N_wordlen], a, ((N_wordlen + 4U) - N_wordlen) * sizeof(uint32_t));
}
1126 
CASPER_ECC_equal(int * res,uint32_t * op1,uint32_t * op2)1127 void CASPER_ECC_equal(int *res, uint32_t *op1, uint32_t *op2)
1128 {
1129     uint32_t a[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
1130     uint32_t b[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
1131     uint32_t c                              = 0;
1132     CASPER_MEMCPY(a, op1, N_wordlen * sizeof(uint32_t));
1133     CASPER_MEMCPY(b, op2, N_wordlen * sizeof(uint32_t));
1134 
1135     do
1136     {
1137         uint32_t _i;
1138         c = (a[0] ^ b[0]);
1139         for (_i = 1; _i < N_wordlen; _i++)
1140         {
1141             c |= (a[_i] ^ b[_i]);
1142         }
1143     } while (false);
1144 
1145     *res = (int32_t)c;
1146 }
1147 
CASPER_ECC_equal_to_zero(int * res,uint32_t * op1)1148 void CASPER_ECC_equal_to_zero(int *res, uint32_t *op1)
1149 {
1150     uint32_t a[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
1151     uint32_t c                              = 0;
1152     CASPER_MEMCPY(a, op1, N_wordlen * sizeof(uint32_t));
1153 
1154     do
1155     {
1156         uint32_t _i;
1157         c = a[0];
1158         for (_i = 1; _i < N_wordlen; _i++)
1159         {
1160             c |= a[_i];
1161         }
1162     } while (false);
1163 
1164     *res = (int32_t)c;
1165 }
1166 
/* P-256 scalar multiplication: (resX, resY) = scalar * (X, Y).
 * Inputs/outputs are affine, normal (non-Montgomery) form, little endian.
 * CASPER memory is addressed in (wordlen+4)-word slots starting after two
 * reserved slots; slot k below means CASPER_MEM[2*(w+4) + k*(w+4)]. */
void CASPER_ECC_SECP256R1_Mul(
    CASPER_Type *base, uint32_t resX[8], uint32_t resY[8], uint32_t X[8], uint32_t Y[8], uint32_t scalar[8])
{
    uint32_t X1[8] = {0};
    uint32_t Y1[8] = {0};
    /* Convert the input point to Montgomery form on the CPU side. */
    toMontgomery_ECC_P256(X1, X);
    toMontgomery_ECC_P256(Y1, Y);

    /* Stage the Montgomery-form point into slots 0 (X) and 1 (Y). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    /* Jacobian result (X:Y:Z) lands in slots 6, 7, 8. */
    Jac_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar, NISTp256, NISTp256_q);

    /* Back to affine: slots 3 (x) and 4 (y) from the Jacobian slots 6, 7, 8. */
    Jac_toAffine(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0};
    one[0]                                        = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* copy out to result */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
}
1223 
/* P-256 double scalar multiplication: (resX, resY) = scalar1*(X1,Y1) + scalar2*(X2,Y2).
 * Typical use is ECDSA verification. Inputs/outputs are affine, normal
 * (non-Montgomery) form, little endian. Slot k below means
 * CASPER_MEM[2*(w+4) + k*(w+4)]; the Jac_toAffine output uses a separate
 * scratch region at word offset 20*w + 80. */
void CASPER_ECC_SECP256R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[8],
                                 uint32_t resY[8],
                                 uint32_t X1[8],
                                 uint32_t Y1[8],
                                 uint32_t scalar1[8],
                                 uint32_t X2[8],
                                 uint32_t Y2[8],
                                 uint32_t scalar2[8])
{
    uint32_t zeroes[(kCASPER_ECC_P256_wordlen + 4U)] = {0};

    /* Stage point 1 into slots 0/1 and point 2 into slots 2/3. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to Montgomery form, in place. */
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Zero slots 4, 5, 6 - they receive the Jacobian (X:Y:Z) result. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    double_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar2);

    /* Affine result goes to the scratch region (offset 20*w+80, two slots). */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Montgomery -> normal form: multiply each coordinate by 1. */
    uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0};
    one[0]                                        = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Copy the normal-form result (slots 1 and 2) out to the caller. */
    CASPER_MEMCPY(resX,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                               1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                               2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
}
1313 
void CASPER_ECC_SECP384R1_Mul(
    CASPER_Type *base, uint32_t resX[12], uint32_t resY[12], uint32_t X[12], uint32_t Y[12], uint32_t scalar[12])
{
    /* Working slots in CASPER memory are (wordlen + 4) limbs wide and the ECC
     * scratch area starts two slots into CASPER_MEM. */
    const uint32_t slot = (uint32_t)kCASPER_ECC_P384_wordlen + 4U;
    uint32_t *scratch   = &CASPER_MEM[2U * slot];
    uint32_t Xm[12]     = {0};
    uint32_t Ym[12]     = {0};

    /* Convert the affine input point into the Montgomery domain. */
    toMontgomery_ECC_P384(Xm, X);
    toMontgomery_ECC_P384(Ym, Y);

    /* Load the point into slots 0 and 1. */
    CASPER_MEMCPY(&scratch[0U * slot], Xm, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[1U * slot], Ym, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    /* (slot6, slot7, slot8) <- scalar * (slot0, slot1) in Jacobian coordinates. */
    Jac_scalar_multiplication(&scratch[6U * slot], &scratch[7U * slot], &scratch[8U * slot], &scratch[0U * slot],
                              &scratch[1U * slot], scalar, NISTp384, NISTp384_q);

    /* (slot3, slot4) <- affine(slot6, slot7, slot8). */
    Jac_toAffine(&scratch[3U * slot], &scratch[4U * slot], &scratch[6U * slot], &scratch[7U * slot],
                 &scratch[8U * slot]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    CASPER_MEMCPY(&scratch[0U * slot], one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    multiply_casper(&scratch[5U * slot], &scratch[3U * slot], &scratch[0U * slot]);
    multiply_casper(&scratch[6U * slot], &scratch[4U * slot], &scratch[0U * slot]);

    /* copy out to result */
    CASPER_MEMCPY(resX, &scratch[5U * slot], (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY, &scratch[6U * slot], (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
}
1369 
void CASPER_ECC_SECP384R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[12],
                                 uint32_t resY[12],
                                 uint32_t X1[12],
                                 uint32_t Y1[12],
                                 uint32_t scalar1[12],
                                 uint32_t X2[12],
                                 uint32_t Y2[12],
                                 uint32_t scalar2[12])
{
    /* Working slots in CASPER memory are (wordlen + 4) limbs wide and the ECC
     * scratch area starts two slots into CASPER_MEM. */
    const uint32_t slot = (uint32_t)kCASPER_ECC_P384_wordlen + 4U;
    uint32_t *scratch   = &CASPER_MEM[2U * slot];
    uint32_t zeroes[(kCASPER_ECC_P384_wordlen + 4U)] = {0};

    /* Load both affine input points into slots 0..3. */
    CASPER_MEMCPY(&scratch[0U * slot], X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[1U * slot], Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[2U * slot], X2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[3U * slot], Y2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to the Montgomery domain (in place). */
    toMontgomery_ECC_P384(&scratch[0U * slot], &scratch[0U * slot]);
    toMontgomery_ECC_P384(&scratch[1U * slot], &scratch[1U * slot]);
    toMontgomery_ECC_P384(&scratch[2U * slot], &scratch[2U * slot]);
    toMontgomery_ECC_P384(&scratch[3U * slot], &scratch[3U * slot]);

    /* Clear the full (wordlen + 4)-limb result slots before use so no stale
     * upper limbs leak into the computation. The P-256 and P-521 variants of
     * this function do the same; this clearing was missing here. */
    CASPER_MEMCPY(&scratch[4U * slot], zeroes, slot * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[5U * slot], zeroes, slot * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[6U * slot], zeroes, slot * sizeof(uint32_t));

    /* (slot4, slot5, slot6) <- scalar1*(X1,Y1) + scalar2*(X2,Y2) in Jacobian
     * coordinates. */
    double_scalar_multiplication(&scratch[4U * slot], &scratch[5U * slot], &scratch[6U * slot], &scratch[0U * slot],
                                 &scratch[1U * slot], scalar1, &scratch[2U * slot], &scratch[3U * slot], scalar2);

    /* Affine result goes to the two slots at offset 20*slot. */
    Jac_toAffine(&CASPER_MEM[20U * slot], &CASPER_MEM[21U * slot], &scratch[4U * slot], &scratch[5U * slot],
                 &scratch[6U * slot]);

    /* Montgomery to Normal: multiply by the constant 1. Copy the full slot so
     * the padding limbs of the operand are zero, matching the P-256 and P-521
     * variants (the original copied only wordlen limbs, leaving them stale). */
    uint32_t one[(kCASPER_ECC_P384_wordlen + 4U)] = {0};
    one[0]                                        = 0x1u;
    CASPER_MEMCPY(&scratch[0U * slot], one, slot * sizeof(uint32_t));
    multiply_casper(&scratch[1U * slot], &CASPER_MEM[20U * slot], &scratch[0U * slot]);
    multiply_casper(&scratch[2U * slot], &CASPER_MEM[21U * slot], &scratch[0U * slot]);

    /* copy out to result */
    CASPER_MEMCPY(resX, &scratch[1U * slot], (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY, &scratch[2U * slot], (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
}
1447 
void CASPER_ECC_SECP521R1_Mul(
    CASPER_Type *base, uint32_t resX[18], uint32_t resY[18], uint32_t X[18], uint32_t Y[18], uint32_t scalar[18])
{
    /* Working slots in CASPER memory are (wordlen + 4) limbs wide and the ECC
     * scratch area starts two slots into CASPER_MEM. */
    const uint32_t slot = (uint32_t)kCASPER_ECC_P521_wordlen + 4U;
    uint32_t *scratch   = &CASPER_MEM[2U * slot];
    uint32_t Xm[18]     = {0};
    uint32_t Ym[18]     = {0};

    /* Convert the affine input point into the Montgomery domain. */
    toMontgomery_ECC_P521(Xm, X);
    toMontgomery_ECC_P521(Ym, Y);

    /* Load the point into slots 0 and 1. */
    CASPER_MEMCPY(&scratch[0U * slot], Xm, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[1U * slot], Ym, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* (slot6, slot7, slot8) <- scalar * (slot0, slot1) in Jacobian coordinates. */
    Jac_scalar_multiplication(&scratch[6U * slot], &scratch[7U * slot], &scratch[8U * slot], &scratch[0U * slot],
                              &scratch[1U * slot], scalar, NISTp521, NISTp521_q);

    /* (slot3, slot4) <- affine(slot6, slot7, slot8). */
    Jac_toAffine(&scratch[3U * slot], &scratch[4U * slot], &scratch[6U * slot], &scratch[7U * slot],
                 &scratch[8U * slot]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[18] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    CASPER_MEMCPY(&scratch[0U * slot], one, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    multiply_casper(&scratch[5U * slot], &scratch[3U * slot], &scratch[0U * slot]);
    multiply_casper(&scratch[6U * slot], &scratch[4U * slot], &scratch[0U * slot]);

    /* copy out to result */
    CASPER_MEMCPY(resX, &scratch[5U * slot], (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY, &scratch[6U * slot], (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}
1524 
void CASPER_ECC_SECP521R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[18],
                                 uint32_t resY[18],
                                 uint32_t X1[18],
                                 uint32_t Y1[18],
                                 uint32_t scalar1[18],
                                 uint32_t X2[18],
                                 uint32_t Y2[18],
                                 uint32_t scalar2[18])
{
    /* Working slots in CASPER memory are (wordlen + 4) limbs wide and the ECC
     * scratch area starts two slots into CASPER_MEM. */
    const uint32_t slot = (uint32_t)kCASPER_ECC_P521_wordlen + 4U;
    uint32_t *scratch   = &CASPER_MEM[2U * slot];
    uint32_t zeroes[(kCASPER_ECC_P521_wordlen + 4U)] = {0};

    /* Load both affine input points into slots 0..3. */
    CASPER_MEMCPY(&scratch[0U * slot], X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[1U * slot], Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[2U * slot], X2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[3U * slot], Y2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to the Montgomery domain (in place). */
    toMontgomery_ECC_P521(&scratch[0U * slot], &scratch[0U * slot]);
    toMontgomery_ECC_P521(&scratch[1U * slot], &scratch[1U * slot]);
    toMontgomery_ECC_P521(&scratch[2U * slot], &scratch[2U * slot]);
    toMontgomery_ECC_P521(&scratch[3U * slot], &scratch[3U * slot]);

    /* Clear the full (wordlen + 4)-limb result slots before use. */
    CASPER_MEMCPY(&scratch[4U * slot], zeroes, slot * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[5U * slot], zeroes, slot * sizeof(uint32_t));
    CASPER_MEMCPY(&scratch[6U * slot], zeroes, slot * sizeof(uint32_t));

    /* (slot4, slot5, slot6) <- scalar1*(X1,Y1) + scalar2*(X2,Y2) in Jacobian
     * coordinates. */
    double_scalar_multiplication(&scratch[4U * slot], &scratch[5U * slot], &scratch[6U * slot], &scratch[0U * slot],
                                 &scratch[1U * slot], scalar1, &scratch[2U * slot], &scratch[3U * slot], scalar2);

    /* Affine result goes to the two slots at offset 20*slot. */
    Jac_toAffine(&CASPER_MEM[20U * slot], &CASPER_MEM[21U * slot], &scratch[4U * slot], &scratch[5U * slot],
                 &scratch[6U * slot]);

    /* Montgomery to Normal: multiply by the constant 1 (full slot, zero padded). */
    uint32_t one[(kCASPER_ECC_P521_wordlen + 4U)] = {0};
    one[0]                                        = 0x1u;
    CASPER_MEMCPY(&scratch[0U * slot], one, slot * sizeof(uint32_t));
    multiply_casper(&scratch[1U * slot], &CASPER_MEM[20U * slot], &scratch[0U * slot]);
    multiply_casper(&scratch[2U * slot], &CASPER_MEM[21U * slot], &scratch[0U * slot]);

    /* copy out to result */
    CASPER_MEMCPY(resX, &scratch[1U * slot], (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY, &scratch[2U * slot], (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}
1615 
// CIOS Multiply. This is the Coarse Integrated form where the values are
// multiplied and reduced for each step of "i". This uses less memory and
// is faster as a result. Note that this is used to square as well as mul,
// so not as fast as pure squaring could be.
//
// Computes the Montgomery product w_out = a * b * R^-1 mod Nmod on the
// CASPER accelerator. When a == NULL, w_out is reduced in place instead
// (Montgomery reduction only). Np points at the 64-bit value used to derive
// the per-round quotient m (m = w[0] * Np mod 2^64) -- the usual Montgomery
// N' constant in CIOS. The final conditional subtraction is done with
// casper_select rather than a data-dependent branch; presumably this is the
// constant-time aspect of the "_ct" suffix -- NOTE(review): confirm against
// the non-_ct variant.
static void MultprecCiosMul_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    /* m (the per-round Montgomery quotient) lives in accelerator RAM. */
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;
    /* T1: scratch copy of w used for the final conditional subtraction. */
    uint32_t *T1 = &CASPER_MEM[0], borrow;

    Np64 = *(uint64_t *)(uintptr_t)Np;

    /* Re-view the 32-bit operand arrays as 64-bit limbs for the accelerator. */
    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /*  if !a, we are reducing only */
        PreZeroW(j, w_out);
    }
    /* Clear the two carry double-words above the N-length accumulator. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /*  with accelerator */

    /*  loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /*  Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /*  push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /*  if mul&reduce vs. reduce only */
            /* w += b[j] * a (full multiply-accumulate over all limbs of a). */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /*  max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /*  so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /*  accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /*  w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /*  prime for 1st; modulo a double-word */

        /*  we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /*  need its carry. We let the accel do this separately - really need a mode to */
        /*  do this "reduce" since it is natural */
        /* w += m * N; the low 64-bit limb of w becomes zero by construction. */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* Shift w right by one 64-bit limb (drop the zeroed low limb). */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /*  now check if need to subtract Nmod */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    // if w_out > T1 then there was a borrow
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));

    /* Clear the carry limbs, then pick w - N (no borrow) or the saved w
       (borrow) without a data-dependent branch. */
    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int16_t)(uint16_t)N_wordlen);
}
1698 
1699 /* Compute C = A - B % mod
1700  * Assumes all operand have two extra limbs to store carry.
1701  */
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    /* 64-bit limb views of the operands for the accelerator. */
    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    /* Scratch buffer at the base of CASPER memory. */
    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));

    /* Compute tmp = A - B. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(tmp)));
    Accel_done();

    /* Borrow detection: the subtraction wrapped if the top word grew.
       NOTE(review): compares only the most-significant word; this relies on
       the operands being fully reduced -- confirm with callers. */
    borrow = (int32_t)((GET_WORD(&((uint32_t *)(uintptr_t)tmp)[N_wordlen - 1U])) > GET_WORD(&A[N_wordlen - 1U]));
    CASPER_MEMCPY(c64, tmp, N_wordlen * sizeof(uint32_t));

    /* Compute C = Mod + tmp */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(c64)));
    Accel_done();

    /* Branch-free select between A - B (tmp) and A - B + mod (C). */
    casper_select(C, (uint32_t *)(uintptr_t)tmp, C, borrow, (int16_t)(uint16_t)N_wordlen);
}
1731 
1732 /* Compute C = A + B % mod
1733  * Assumes all operand have two extra limbs to store carry.
1734  */
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    /* 64-bit limb views of the operands for the accelerator. */
    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    /* Scratch buffer at the base of CASPER memory. */
    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));
    /* Clear the extra carry limb of every operand before the wide add. */
    SET_DWORD(&tmp[N_wordlen / 2U], 0ULL);
    SET_DWORD(&b64[N_wordlen / 2U], 0ULL);
    SET_DWORD(&m64[N_wordlen / 2U], 0ULL);

    /* Compute tmp = A + B using one additonal double-length limb. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(tmp)));
    Accel_done();

    CASPER_MEMCPY(c64, tmp, (N_wordlen + 2U) * sizeof(uint32_t));

    /* Compute C = Mod - tmp */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(c64)));
    Accel_done();

    // borrow = g_carry;
    /* Borrow detection via the carry word, then branch-free select between
       A + B (C) and A + B - mod (tmp). */
    borrow = (int32_t)(GET_WORD(&C[N_wordlen]) > GET_WORD(&(((uint32_t *)(uintptr_t)tmp)[N_wordlen])));
    casper_select(C, C, (uint32_t *)(uintptr_t)tmp, borrow, (int16_t)(uint16_t)N_wordlen);
}
1768 
1769 /* Compute c = a/2 mod p where b is scratch space. */
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b)
{
    shiftright(b, a, 1U); /* Compute a/2 and (a+p)/2       */

    /* Compute tmp = a + p using one additonal double-length limb. */
    CASPER_MEMCPY(c, a, N_wordlen * sizeof(uint32_t));
    SET_WORD(&c[N_wordlen], 0);
    SET_WORD(&c[N_wordlen + 1U], 0U);

    /* Add the value stored at CASPER_MEM[N_wordlen + 4U] to c.
       NOTE(review): this is presumably the modulus p placed there by the
       caller's memory layout -- confirm. */
    Accel_SetABCD_Addr(CA_MK_OFF(((uint64_t *)(uintptr_t)&CASPER_MEM[(N_wordlen + 4U)])), 0);
    Accel_crypto_mul(
        Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(((uint64_t *)(uintptr_t)c))));
    Accel_done();

    /* Halve (a + p): shift right one bit and pull bit 0 of the carry word
       into the top bit of the most-significant limb. */
    shiftright(c, c, 1U);
    SET_WORD(&c[N_wordlen - 1U], GET_WORD(&c[N_wordlen - 1U]) | (GET_WORD(&c[N_wordlen]) << 31));
    SET_WORD(&c[N_wordlen], 0U);
    /* Pick between a/2 (in b) and (a+p)/2 (in c) based on the parity of a,
       without a data-dependent branch. */
    casper_select(c, b, c, (int32_t)(uint32_t)(GET_WORD(&a[0]) & 1U), (int16_t)(uint16_t)(N_wordlen));
}
1789 
/* Function wrapper around the GET_WORD accessor so a single 32-bit read from
 * CASPER memory can be taken by address / used where a macro cannot. */
static uint32_t casper_get_word(uint32_t *addr)
{
    uint32_t word = GET_WORD(addr);
    return word;
}
1794 
1795 /* Shift right by 1 <= c <= 31. z[] and x[] in system RAM, no interleaving macros used. */
shiftrightSysram(uint32_t * z,uint32_t * x,uint32_t c)1796 static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c)
1797 {
1798     z[0] = (x[1] << (32U - (c))) | (x[0] >> (c));
1799     z[1] = (x[2] << (32U - (c))) | (x[1] >> (c));
1800     z[2] = (x[3] << (32U - (c))) | (x[2] >> (c));
1801     z[3] = (x[4] << (32U - (c))) | (x[3] >> (c));
1802     z[4] = (x[5] << (32U - (c))) | (x[4] >> (c));
1803     z[5] = (x[6] << (32U - (c))) | (x[5] >> (c));
1804     z[6] = (x[7] << (32U - (c))) | (x[6] >> (c));
1805 
1806     if (N_wordlen == 18U)
1807     {
1808         z[7]  = (x[8] << (32U - (c))) | (x[7] >> (c));
1809         z[8]  = (x[9] << (32U - (c))) | (x[8] >> (c));
1810         z[9]  = (x[10] << (32U - (c))) | (x[9] >> (c));
1811         z[10] = (x[11] << (32U - (c))) | (x[10] >> (c));
1812         z[11] = (x[12] << (32U - (c))) | (x[11] >> (c));
1813         z[12] = (x[13] << (32U - (c))) | (x[12] >> (c));
1814         z[13] = (x[14] << (32U - (c))) | (x[13] >> (c));
1815         z[14] = (x[15] << (32U - (c))) | (x[14] >> (c));
1816         z[15] = (x[16] << (32U - (c))) | (x[15] >> (c));
1817         z[16] = (x[17] << (32U - (c))) | (x[16] >> (c));
1818         z[17] = (x[17] >> (c));
1819     }
1820 
1821     if (N_wordlen == 12U)
1822     {
1823         z[7]  = (x[8] << (32U - (c))) | (x[7] >> (c));
1824         z[8]  = (x[9] << (32U - (c))) | (x[8] >> (c));
1825         z[9]  = (x[10] << (32U - (c))) | (x[9] >> (c));
1826         z[10] = (x[11] << (32U - (c))) | (x[10] >> (c));
1827         z[11] = (x[11] >> (c));
1828     }
1829     if (N_wordlen == 8U)
1830     {
1831         z[7] = (x[7] >> (c));
1832     }
1833 }
1834 /* Shift right by 1 <= c <= 31. */
shiftright(uint32_t * z,uint32_t * x,uint32_t c)1835 static void shiftright(uint32_t *z, uint32_t *x, uint32_t c)
1836 {
1837     SET_WORD(&z[0], (GET_WORD(&x[1]) << (32U - (c))) | (GET_WORD(&x[0]) >> (c)));
1838     SET_WORD(&z[1], (GET_WORD(&x[2]) << (32U - (c))) | (GET_WORD(&x[1]) >> (c)));
1839     SET_WORD(&z[2], (GET_WORD(&x[3]) << (32U - (c))) | (GET_WORD(&x[2]) >> (c)));
1840     SET_WORD(&z[3], (GET_WORD(&x[4]) << (32U - (c))) | (GET_WORD(&x[3]) >> (c)));
1841     SET_WORD(&z[4], (GET_WORD(&x[5]) << (32U - (c))) | (GET_WORD(&x[4]) >> (c)));
1842     SET_WORD(&z[5], (GET_WORD(&x[6]) << (32U - (c))) | (GET_WORD(&x[5]) >> (c)));
1843     SET_WORD(&z[6], (GET_WORD(&x[7]) << (32U - (c))) | (GET_WORD(&x[6]) >> (c)));
1844 
1845     if (N_wordlen == 18U)
1846     {
1847         SET_WORD(&z[7], (GET_WORD(&x[8]) << (32U - (c))) | (GET_WORD(&x[7]) >> (c)));
1848         SET_WORD(&z[8], (GET_WORD(&x[9]) << (32U - (c))) | (GET_WORD(&x[8]) >> (c)));
1849         SET_WORD(&z[9], (GET_WORD(&x[10]) << (32U - (c))) | (GET_WORD(&x[9]) >> (c)));
1850         SET_WORD(&z[10], (GET_WORD(&x[11]) << (32U - (c))) | (GET_WORD(&x[10]) >> (c)));
1851         SET_WORD(&z[11], (GET_WORD(&x[12]) << (32U - (c))) | (GET_WORD(&x[11]) >> (c)));
1852         SET_WORD(&z[12], (GET_WORD(&x[13]) << (32U - (c))) | (GET_WORD(&x[12]) >> (c)));
1853         SET_WORD(&z[13], (GET_WORD(&x[14]) << (32U - (c))) | (GET_WORD(&x[13]) >> (c)));
1854         SET_WORD(&z[14], (GET_WORD(&x[15]) << (32U - (c))) | (GET_WORD(&x[14]) >> (c)));
1855         SET_WORD(&z[15], (GET_WORD(&x[16]) << (32U - (c))) | (GET_WORD(&x[15]) >> (c)));
1856         SET_WORD(&z[16], (GET_WORD(&x[17]) << (32U - (c))) | (GET_WORD(&x[16]) >> (c)));
1857         SET_WORD(&z[17], (GET_WORD(&x[17]) >> (c)));
1858     }
1859     if (N_wordlen == 12U)
1860     {
1861         SET_WORD(&z[7], (GET_WORD(&x[8]) << (32U - (c))) | (GET_WORD(&x[7]) >> (c)));
1862         SET_WORD(&z[8], (GET_WORD(&x[9]) << (32U - (c))) | (GET_WORD(&x[8]) >> (c)));
1863         SET_WORD(&z[9], (GET_WORD(&x[10]) << (32U - (c))) | (GET_WORD(&x[9]) >> (c)));
1864         SET_WORD(&z[10], (GET_WORD(&x[11]) << (32U - (c))) | (GET_WORD(&x[10]) >> (c)));
1865         SET_WORD(&z[11], (GET_WORD(&x[11]) >> (c)));
1866     }
1867     if (N_wordlen == 8U)
1868     {
1869         SET_WORD((&z[7]), (GET_WORD(&x[7]) >> (c)));
1870     }
1871 }
/* Shift left by 1 <= c <= 31.
 * NOTE(review): the right-hand side reads z[] (the destination), not x[],
 * for the carried-in low bits. This only yields x << c when z already holds
 * the same words as x (i.e. in-place use, z == x). For z != x the result
 * depends on the prior contents of z -- confirm all callers pass z == x.
 * Words are processed from most to least significant, which is the order
 * that makes the in-place left shift correct.
 */
static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c)
{
    if (N_wordlen == 18U)
    {
        SET_WORD(&z[17], (GET_WORD(&x[17]) << (c)) | GET_WORD(&z[16]) >> (32U - (c)));
        SET_WORD(&z[16], (GET_WORD(&x[16]) << (c)) | GET_WORD(&z[15]) >> (32U - (c)));
        SET_WORD(&z[15], (GET_WORD(&x[15]) << (c)) | GET_WORD(&z[14]) >> (32U - (c)));
        SET_WORD(&z[14], (GET_WORD(&x[14]) << (c)) | GET_WORD(&z[13]) >> (32U - (c)));
        SET_WORD(&z[13], (GET_WORD(&x[13]) << (c)) | GET_WORD(&z[12]) >> (32U - (c)));
        SET_WORD(&z[12], (GET_WORD(&x[12]) << (c)) | GET_WORD(&z[11]) >> (32U - (c)));
        SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c)));
        SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c)));
        SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c)));
        SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c)));
    }
    if (N_wordlen == 12U)
    {
        SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c)));
        SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c)));
        SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c)));
        SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c)));
    }
    /* Common low 8 words for all supported sizes. */
    SET_WORD(&z[7], (GET_WORD(&x[7]) << (c)) | GET_WORD(&z[6]) >> (32U - (c)));
    SET_WORD(&z[6], (GET_WORD(&x[6]) << (c)) | GET_WORD(&z[5]) >> (32U - (c)));
    SET_WORD(&z[5], (GET_WORD(&x[5]) << (c)) | GET_WORD(&z[4]) >> (32U - (c)));
    SET_WORD(&z[4], (GET_WORD(&x[4]) << (c)) | GET_WORD(&z[3]) >> (32U - (c)));
    SET_WORD(&z[3], (GET_WORD(&x[3]) << (c)) | GET_WORD(&z[2]) >> (32U - (c)));
    SET_WORD(&z[2], (GET_WORD(&x[2]) << (c)) | GET_WORD(&z[1]) >> (32U - (c)));
    SET_WORD(&z[1], (GET_WORD(&x[1]) << (c)) | GET_WORD(&z[0]) >> (32U - (c)));
    SET_WORD(&z[0], (GET_WORD(&x[0]) << (c)));
}
1904 
multiply_casper(uint32_t w_out[],const uint32_t a[],const uint32_t b[])1905 static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[])
1906 {
1907     uint32_t *Np;
1908 
1909     if (N_wordlen == 8U)
1910     {
1911         Np = Np256;
1912         MultprecCiosMul_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np);
1913     }
1914     if (N_wordlen == 12U)
1915     {
1916         Np = Np384;
1917         MultprecCiosMul_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np);
1918     }
1919 
1920     if (N_wordlen == 18U)
1921     {
1922         Np = Np521;
1923         MultprecCiosMul521_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np);
1924     }
1925 }
/* Convert a Jacobian projective point (X1 : Y1 : Z1)
 * to the affine point (X3, Y3) = (X1/Z1^2, Y1/Z1^3).
 * The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap.
 */
void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    uint32_t *T1, *T2;

    /* Two temporaries carved out of the shared CASPER scratch area. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];

    square_casper(T1, Z1);       // Z^2
    multiply_casper(T2, T1, Z1); // Z^3

    // Montgomery inverse: T1 = Z^-3, dispatched on the active curve size
    if (N_wordlen == 8U)
    {
        invert_mod_p256(T1, T2);
    }

    if (N_wordlen == 12U)
    {
        invert_mod_p384(T1, T2);
    }

    if (N_wordlen == 18U)
    {
        invert_mod_p521(T1, T2);
    }

    multiply_casper(Y3, Y1, T1); // Y3 = Y/Z^3
    multiply_casper(T2, T1, Z1); // Z^-2 (= Z^-3 * Z)
    multiply_casper(X3, X1, T2); // X3 = X/Z^2
}
1960 
/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
 *  where (X1: Y1: Z1) != (X2 : Y2 : Z2)
 * (X3 : Y3: Z3) may be the same as one of the inputs.
 * Handles the special cases: either input at infinity (Z == 0),
 * equal points (falls back to doubling), and P + (-P) (Z3 becomes 0).
 */
void Jac_addition(uint32_t *X3,
                  uint32_t *Y3,
                  uint32_t *Z3,
                  uint32_t *X1,
                  uint32_t *Y1,
                  uint32_t *Z1,
                  uint32_t *X2,
                  uint32_t *Y2,
                  uint32_t *Z2)
{
    uint32_t *Z1Z1, *Z2Z2, *U1, *S1, *J, *H, *V, *t0, *t1;
    int m1, m2;

    /* Nine temporaries carved out of the shared CASPER scratch area. */
    Z1Z1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    Z2Z2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    U1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    S1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    J    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];
    H    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
    V    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
    t0   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
    t1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];

    /* If either input is the point at infinity (Z == 0), return the other. */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    CASPER_ECC_equal_to_zero(&m2, Z2);
    if (m1 == 0)
    {
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z2, N_wordlen * 4U);
        return;
    }
    if (m2 == 0)
    {
        CASPER_MEMCPY(X3, X1, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y1, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z1, N_wordlen * 4U);
        return;
    }

    /* U1 = X1*Z2^2, H = X2*Z1^2, S1 = Y1*Z2^3, J = Y2*Z1^3. */
    square_casper(Z1Z1, Z1);
    square_casper(Z2Z2, Z2);
    multiply_casper(U1, X1, Z2Z2);
    multiply_casper(H, X2, Z1Z1); /* if H equals U1 then X's are the same */
    multiply_casper(t0, Z2, Z2Z2);
    multiply_casper(S1, Y1, t0);
    multiply_casper(t0, Z1, Z1Z1);
    multiply_casper(J, Y2, t0);   /* if (S1 == J) then Y's are the same */

    CASPER_ECC_equal(&m1, H, U1); /* If H and U1 match then the X-coordinates are the same. */
    CASPER_ECC_equal(&m2, S1, J); /* If S1 and J match then the Y-coordinates are the same. */
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Same point: addition formula degenerates, use doubling. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
        P + (-P): we work with the point at infinity.
        H is zero here, so Z3 = 2*Z1*Z2*H below becomes zero.
        } */
    }

    /* Main addition: H = X2*Z1^2 - X1*Z2^2, r = 2*(S2 - S1),
     * I = (2H)^2, J = H*I, V = U1*I. */
    sub_casper(H, H, U1);
    mul2_casper(t0, H);
    square_casper(t1, t0);
    sub_casper(t0, J, S1);
    multiply_casper(J, H, t1);
    multiply_casper(V, U1, t1);
    mul2_casper(U1, t0);
    square_casper(t0, U1);
    mul2_casper(t1, V);
    /* X3 = r^2 - J - 2V. */
    sub_casper(t0, t0, J);
    sub_casper(X3, t0, t1);
    /* Y3 = r*(V - X3) - 2*S1*J. */
    sub_casper(t0, V, X3);
    multiply_casper(t1, S1, J);
    mul2_casper(t1, t1);
    multiply_casper(V, U1, t0);
    sub_casper(Y3, V, t1);
    /* Z3 = ((Z1 + Z2)^2 - Z1^2 - Z2^2) * H = 2*Z1*Z2*H. */
    add_casper(V, Z1, Z2);
    square_casper(t1, V);
    sub_casper(t1, t1, Z1Z1);
    sub_casper(t1, t1, Z2Z2);
    multiply_casper(Z3, t1, H);
}
2051 
/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2),
 * i.e. mixed Jacobian + affine point addition,
 * where (X1: Y1: Z1) != (X2, Y2).
 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
 * Source: 2004 Hankerson-Menezes-Vanstone, page 91.
 */
void Jac_add_affine(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;
    uint32_t *ONE = NULL;
    int m1, m2;

    /* Montgomery-domain constant 1 for the active curve size. */
    if (N_wordlen == 8U)
    {
        ONE = NISTr256;
    }
    if (N_wordlen == 12U)
    {
        ONE = NISTr384;
    }
    if (N_wordlen == 18U)
    {
        ONE = NISTr521;
    }

    /* Five temporaries carved out of the shared CASPER scratch area. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    /* If the projective input is the point at infinity (Z1 == 0), the
     * result is the affine input with Z = 1 (Montgomery one). */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    if (m1 == 0)
    {
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, ONE, N_wordlen * 4U);
        return;
    }

    /* T4 = X2*Z1^2, T3 = Y2*Z1^3 (affine point lifted to P1's scale). */
    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    square_casper(T3, Z1);
    multiply_casper(T2, T3, Z1);
    multiply_casper(T4, T3, X2);
    multiply_casper(T3, T2, Y2);

    CASPER_ECC_equal(&m1, T4, X1);
    CASPER_ECC_equal(&m2, T3, Y1);
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Same point: addition formula degenerates, use doubling. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
          P + (-P): we work with the point at infinity.
          T1 = A is zero here, so Z3 = Z1*A below becomes zero.
        } */
    }

    /* A = X2*Z1^2 - X1 (T1), B = Y2*Z1^3 - Y1 (T2), Z3 = Z1*A. */
    sub_casper(T1, T4, X1);
    sub_casper(T2, T3, Y1);
    multiply_casper(Z3, T5, T1);
    /* C = A^2 (T3), A^3 (T4), D = X1*A^2 (T5). */
    square_casper(T3, T1);
    multiply_casper(T4, T3, T1);
    multiply_casper(T5, T3, X1);
    /* X3 = B^2 - 2D - A^3. */
    mul2_casper(T1, T5);
    square_casper(X3, T2);
    sub_casper(X3, X3, T1);
    sub_casper(X3, X3, T4);
    /* Y3 = B*(D - X3) - Y1*A^3. */
    sub_casper(T3, T5, X3);
    multiply_casper(T1, T3, T2);
    multiply_casper(T2, T4, Y1);
    sub_casper(Y3, T1, T2);
}
2128 
2129 static uint32_t casper_get_word(uint32_t *addr);
2130 
/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, page 91.
 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
 * (X3 : Y3: Z3) may be the same as the input.
 * Uses the a = -3 curve shortcut: 3X^2 + a*Z^4 = 3*(X - Z^2)*(X + Z^2),
 * which holds for all NIST prime curves handled by this driver.
 */
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;

    /* Five temporaries carved out of the shared CASPER scratch area. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    /* A = 3*(X1 - Z1^2)*(X1 + Z1^2), accumulated in T2. */
    square_casper(T1, Z1);
    sub_casper(T3, X1, T1);
    add_casper(T1, X1, T1);
    multiply_casper(T4, T3, T1);

    mul2_casper(T3, T4);

    add_casper(T2, T3, T4);

    /* Z3 = 2*Y1*Z1. */
    mul2_casper(Y3, Y1);

    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    multiply_casper(Z3, Y3, T5);

    /* B = 4*X1*Y1^2 (T3). */
    square_casper(T5, Y3);

    multiply_casper(T3, T5, X1);

    /* T5 = 8*Y1^4 (half of (2Y1)^4); T4 is scratch for the halving. */
    square_casper(Y3, T5);

    half(T5, Y3, T4);

    /* X3 = A^2 - 2B. */
    square_casper(X3, T2);

    mul2_casper(T1, T3);

    sub_casper(X3, X3, T1);

    /* Y3 = A*(B - X3) - 8*Y1^4. */
    sub_casper(T1, T3, X3);

    multiply_casper(T3, T1, T2);

    sub_casper(Y3, T3, T5);
}
2179 
2180 /* Recoding for a signed fixed window.
2181  * Source: https://eprint.iacr.org/2014/130.pdf, Algorithm 6
2182  * Recode the n-bit integer k into ciel(log2(n)/(w-1)) digits
2183  * where each digit is in
2184  * { +/- 1, +/- 3, ..., +/- 2^(w-1)-1 }
2185  * and put the result in c.
2186  */
recode(int8_t * c,uint32_t * k,int n,int w)2187 static void recode(int8_t *c, uint32_t *k, int n, int w)
2188 {
2189     int i, t;
2190     uint32_t K[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
2191     (void)memcpy(K, k, (size_t)ceil(((double)n / 8.)));
2192     t = (n + (w - 2)) / (w - 1);
2193     for (i = 0; i < t; i++)
2194     {
2195         c[i] = (int8_t)(uint8_t)((K[0] & ((uint32_t)(uint32_t)(1UL << (uint32_t)w) - 1UL)) -
2196                                  (uint32_t)(uint32_t)(1UL << ((uint32_t)w - 1UL)));
2197         shiftrightSysram(K, K, (unsigned)w - 1U);
2198         (void)add_n_1(K, K, (uint32_t)c[i] >> 31, (int16_t)(uint16_t)N_wordlen);
2199     }
2200     c[t] = (int8_t)K[0];
2201 }
2202 
sub_n(uint32_t * c,uint32_t * a,uint32_t * b,int n)2203 static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n)
2204 {
2205     int i;
2206     uint32_t borrow;
2207     sub_borrowout(borrow, GET_WORD(&c[0]), a[0], GET_WORD(&b[0]));
2208     for (i = 1; i < n; i++)
2209     {
2210         sub_borrowin_borrowout(borrow, GET_WORD(&c[i]), a[i], GET_WORD(&b[i]), borrow);
2211     }
2212     return borrow;
2213 }
2214 
2215 #if 0
2216 /* Dumb n-limb subtraction of c=a-b, return borrow. */
2217 static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n) {
2218   int i;
2219   uint32_t borrow;
2220   sub_borrowout(borrow, c[0], a[0], b);
2221   for (i = 1; i < n; i++) {
2222     sub_borrowin_borrowout_1(borrow, c[i], a[i], borrow);
2223   }
2224   return borrow;
2225 }
2226 
2227 /* Dumb n-limb addition of c=a+b, return carry. */
2228 static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n) {
2229   int i;
2230   uint32_t carry;
2231   add_cout(carry, c[0], a[0], b[0]);
2232   for (i = 1; i < n; i++) {
2233     add_cout_cin(carry, c[i], a[i], b[i], carry);
2234   }
2235   return carry;
2236 }
2237 #endif
2238 
/* Dumb n-limb addition of a single word: c = a + b where b is one 32-bit
 * word added into the least significant limb; returns the final carry. */
static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n)
{
    int i;
    uint32_t carry;
    /* Add b into the least significant limb, producing the initial carry. */
    add_cout(carry, c[0], a[0], b);
    for (i = 1; i < n; i++)
    {
        /* Propagate the carry through the remaining limbs. */
        add_cout_cin(carry, c[i], a[i], 0U, carry);
    }
    return carry;
}
2251 
int8abs(int8_t v)2252 static uint8_t int8abs(int8_t v)
2253 {
2254     return ((v < 0) ? ((uint8_t)-v) : ((uint8_t)v));
2255 }
2256 
/* Constant time elliptic curve scalar multiplication.
 * Source: https://eprint.iacr.org/2014/130.pdf
 * when using w = 4.
 * Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
 * p is the prime used to define the finite field F_p
 * q is the (prime) order of the curve
 *
 * Outline: force the scalar odd (replacing k by q - k when even and fixing
 * the sign at the end), recode it into signed odd digits in {+-1,+-3,+-5,+-7},
 * precompute a LUT of 1P/3P/5P/7P, then scan the digits MSB-first doing
 * three doublings plus one constant-time LUT add per digit.
 */
void Jac_scalar_multiplication(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q)
{
    uint32_t *scalar, *M, *X, *Y, *Z, *mem_loc;
    uint32_t *ONE = NULL;
    int i, sign, odd;
    uint8_t index;
    size_t recodeLength                  = 175u;
    size_t bitlen                        = 0u;
    int8_t rec[CASPER_RECODE_LENGTH_MAX] = {0};

    /* Select recoding length, bit length and Montgomery one per curve size. */
    if (N_wordlen == 8U)
    {
        recodeLength = (size_t)kCASPER_ECC_P256_recode_len;
        bitlen       = (size_t)kCASPER_ECC_P256_N_bitlen;
        ONE          = NISTr256;
    }

    if (N_wordlen == 12U)
    {
        recodeLength = (size_t)kCASPER_ECC_P384_recode_len;
        bitlen       = (size_t)kCASPER_ECC_P384_N_bitlen;
        ONE          = NISTr384;
    }

    if (N_wordlen == 18U)
    {
        recodeLength = (size_t)kCASPER_ECC_P521_recode_len;
        bitlen       = (size_t)521U;
        ONE          = NISTr521;
    }

    /* Point to the start of the LUT table space. */
    mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];

    /* Working variables placed after the LUT in CASPER memory. */
    scalar = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * (N_wordlen + 4U)];
    X      = &CASPER_MEM[(20U * N_wordlen + 80U) + 13U * (N_wordlen + 4U)];
    Y      = &CASPER_MEM[(20U * N_wordlen + 80U) + 14U * (N_wordlen + 4U)];
    Z      = &CASPER_MEM[(20U * N_wordlen + 80U) + 15U * (N_wordlen + 4U)];
    M      = &CASPER_MEM[(20U * N_wordlen + 80U) + 16U * (N_wordlen + 4U)];

    /* Point to memory the recoded scalar.
     */
    CASPER_MEMCPY(scalar, k, sizeof(uint32_t) * N_wordlen);

/* Precomputation: compute 1*P, 3*P, 5*P, and 7*P.
 * FSL_CASPER_LUT(P, x) addresses coordinate x (0 = X, 1 = Y, 2 = Z) of the
 * LUT entry for the odd multiple P. */
#define FSL_CASPER_LUT(P, x) (mem_loc + (3U * ((P)-1U) / 2U + (x)) * (N_wordlen + 4U))

    /* Set 1*P */
    CASPER_MEMCPY(Z3, ONE, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 0U), X1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 1U), Y1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 2U), Z3, N_wordlen * sizeof(uint32_t));

    /* Compute 2*P */
    Jac_double(X3, Y3, Z3, X1, Y1, Z3);

    /* Compute 3*P = 2P + P */
    Jac_add_affine(FSL_CASPER_LUT(3U, 0U), FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3, X1, Y1);

    /* Compute 5*P = 3P + 2P */
    Jac_addition(FSL_CASPER_LUT(5U, 0U), FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), FSL_CASPER_LUT(3U, 0U),
                 FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3);

    /* Compute 7*P = 5P + 2P */
    Jac_addition(FSL_CASPER_LUT(7U, 0U), FSL_CASPER_LUT(7U, 1U), FSL_CASPER_LUT(7U, 2U), FSL_CASPER_LUT(5U, 0U),
                 FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), X3, Y3, Z3);

    /* Recode the scalar: the algorithm needs an odd scalar, so when k is
     * even use q - k instead (constant-time select on the parity bit) and
     * compensate with a final negation of Y3 at the end. */
    odd = (int32_t)((uint32_t)(casper_get_word(&scalar[0]) & 1U));
    (void)sub_n(M, q, scalar, (int16_t)(uint16_t)N_wordlen); // todo!!!
    casper_select(scalar, M, scalar, odd, (int16_t)(uint16_t)N_wordlen);

    /* Use n=384 and w=4 --> compute ciel(384/3) = 128 + 1 digits */
    uint32_t scalarSysram[CASPER_MAX_ECC_SIZE_WORDLEN];
    CASPER_MEMCPY(scalarSysram, scalar, /*CASPER_*/ N_wordlen * sizeof(uint32_t));
    recode(rec, scalarSysram, (int32_t)bitlen, 4);

    /* Set the first value: most significant digit, magnitude and sign bit. */
    index = int8abs(rec[recodeLength - 1U]);
    sign  = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[recodeLength - 1U]) >> 7);

    CASPER_MEMCPY(X3, FSL_CASPER_LUT((uint32_t)index, 0U), N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Y3, FSL_CASPER_LUT((uint32_t)index, 1U), N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Z3, FSL_CASPER_LUT((uint32_t)index, 2U), N_wordlen * sizeof(uint32_t));

    /* Get the correct LUT element in constant time by touching
     * all elements and masking out the correct one.
     */

#define GET_LUT(x, y, z, index)                                                           \
    do                                                                                    \
    {                                                                                     \
        int m;                                                                            \
        CASPER_MEMCPY((x), FSL_CASPER_LUT(1U, 0U), N_wordlen * sizeof(uint32_t));         \
        CASPER_MEMCPY((y), FSL_CASPER_LUT(1U, 1U), N_wordlen * sizeof(uint32_t));         \
        CASPER_MEMCPY((z), FSL_CASPER_LUT(1U, 2U), N_wordlen * sizeof(uint32_t));         \
        m = (int32_t)((index) == 3U);                                                     \
        casper_select((x), (x), FSL_CASPER_LUT(3U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(3U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(3U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
        m = (int32_t)((index) == 5U);                                                     \
        casper_select((x), (x), FSL_CASPER_LUT(5U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(5U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(5U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
        m = (int32_t)((index) == 7U);                                                     \
        casper_select((x), (x), FSL_CASPER_LUT(7U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(7U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(7U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
    } while (false)

    GET_LUT(X3, Y3, Z3, index);

    /* Compute -y and select the positive or negative point. */
    (void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!
    casper_select(Y3, Y3, M, sign, (int16_t)(uint16_t)N_wordlen);

    /* Main loop: per digit, shift by w-1 = 3 bits (three doublings),
     * then add the signed LUT point for that digit. */
    for (i = (int)(uint32_t)(recodeLength - 2U); i >= 0; i--)
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);

        index = int8abs(rec[i]);
        sign  = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[i]) >> 7);

        GET_LUT(X, Y, Z, index);

        /* Compute -y and select the positive or negative point. */
        (void)sub_n(scalar, p, Y, (int16_t)(uint16_t)N_wordlen); // todo!!!
        casper_select(scalar, Y, scalar, sign, (int16_t)(uint16_t)N_wordlen);

        Jac_addition(X3, Y3, Z3, X3, Y3, Z3, X, scalar, Z);
    }

    /* Undo the initial k -> q - k substitution: negate Y3 when the
     * original scalar was even. */
    (void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!

    casper_select(Y3, M, Y3, odd, (int16_t)(uint16_t)N_wordlen);
}
2403 
2404 #undef FSL_CASPER_LUT
2405 #undef GET_LUT
2406 
2407 /*
2408  * Pre-compute the following 16 points:
2409  * 00 00 = 0*P + 0*Q  <-- Not needed when using sliding windows
2410  * 00 01 = 0*P + 1*Q  <-- Not needed when using sliding windows
2411  * 00 10 = 0*P + 2*Q
2412  * 00 11 = 0*P + 3*Q
2413  *
2414  * 01 00 = 1*P + 0*Q  <-- Not needed when using sliding windows
2415  * 01 01 = 1*P + 1*Q  <-- Not needed when using sliding windows
2416  * 01 10 = 1*P + 2*Q
2417  * 01 11 = 1*P + 3*Q
2418  *
2419  * 10 00 = 2*P + 0*Q
2420  * 10 01 = 2*P + 1*Q
2421  * 10 10 = 2*P + 2*Q
2422  * 10 11 = 2*P + 3*Q
2423  *
2424  * 11 00 = 3*P + 0*Q
2425  * 11 01 = 3*P + 1*Q
2426  * 11 10 = 3*P + 2*Q
2427  * 11 11 = 3*P + 3*Q
2428  *
2429  * index = (bitsi||bitsj)-2 - (biti != 0)*2
2430  *
2431  * Input:   P = (X1 : Y1 : Z1) and
2432  *          Q = (X2 : Y2 : Z2)
2433  * Output: mem_loc, memory location for the LUT.
2434  */
2435 
precompute_double_scalar_LUT16(uint32_t * Px,uint32_t * Py,uint32_t * Qx,uint32_t * Qy)2436 static void precompute_double_scalar_LUT16(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy)
2437 {
2438     uint32_t *Q2x, *Q2y, *Q2z, *P2x, *P2y, *P2z, *Z, *mem_loc;
2439     uint32_t *ONE  = NULL;
2440     uint32_t index = 0;
2441 
2442     if (N_wordlen == 8U)
2443     {
2444         ONE = NISTr256;
2445     }
2446 
2447     if (N_wordlen == 12U)
2448     {
2449         ONE = NISTr384;
2450     }
2451 
2452     Q2x = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 0U * (N_wordlen + 4U)];
2453     Q2y = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)];
2454     Q2z = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)];
2455 
2456     /* Re-use memory from different scratch space since no
2457      * projective point addition is used below. */
2458     P2x = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
2459     P2z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
2460     P2y = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
2461     Z   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];
2462 
2463     mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];
2464 
2465     CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t));
2466 
2467     // 00 10 = 0*P + 2*Q
2468     Jac_double(Q2x, Q2y, Q2z, Qx, Qy, Z);
2469     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2470     index += N_wordlen;
2471     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2472     index += N_wordlen;
2473     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2474     index += N_wordlen;
2475 
2476     // 00 11 = 0*P + 3*Q
2477     Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Qx, Qy);
2478     CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2479     index += N_wordlen;
2480     CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2481     index += N_wordlen;
2482     CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2483     index += N_wordlen;
2484 
2485     // 01 10 = 1*P + 2*Q
2486     Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Px, Py);
2487     CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2488     index += N_wordlen;
2489     CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2490     index += N_wordlen;
2491     CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2492     index += N_wordlen;
2493 
2494     // 01 11 = 1*P + 3*Q
2495     Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Qx, Qy);
2496     CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2497     index += N_wordlen;
2498     CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2499     index += N_wordlen;
2500     CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2501     index += N_wordlen;
2502 
2503     // 10 00 = 2*P + 0*Q
2504     Jac_double(P2x, P2y, P2z, Px, Py, Z);
2505     CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2506     index += N_wordlen;
2507     CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2508     index += N_wordlen;
2509     CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2510     index += N_wordlen;
2511 
2512     // 10 01 = 2*P + 1*Q
2513     Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy);
2514     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2515     index += N_wordlen;
2516     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2517     index += N_wordlen;
2518     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2519     index += N_wordlen;
2520 
2521     // 10 10 = 2*P + 2*Q
2522     Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2523     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2524     index += N_wordlen;
2525     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2526     index += N_wordlen;
2527     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2528     index += N_wordlen;
2529 
2530     // 10 11 = 2*P + 3*Q
2531     Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2532     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2533     index += N_wordlen;
2534     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2535     index += N_wordlen;
2536     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2537     index += N_wordlen;
2538 
2539     // 11 00 = 3*P + 0*Q
2540     Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Px, Py);
2541     CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
2542     index += N_wordlen;
2543     CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
2544     index += N_wordlen;
2545     CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
2546     index += N_wordlen;
2547 
2548     // 11 01 = 3*P + 1*Q
2549     Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy);
2550     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2551     index += N_wordlen;
2552     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2553     index += N_wordlen;
2554     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2555     index += N_wordlen;
2556 
2557     // 11 10 = 3*P + 2*Q
2558     Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2559     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2560     index += N_wordlen;
2561     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2562     index += N_wordlen;
2563     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2564     index += N_wordlen;
2565 
2566     // 11 11 = 3*P + 3*Q
2567     Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
2568     CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
2569     index += N_wordlen;
2570     CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
2571     index += N_wordlen;
2572     CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
2573     index += N_wordlen;
2574 }
2575 
2576 /*
2577  * Pre-compute the following 4 points (the "0 0" entry is never stored):
2578  * 0 0 = 0*P + 0*Q  <-- Not needed when using sliding windows
2579  * 0 1 = 0*P + 1*Q
2580  *
2581  * 1 0 = 1*P + 0*Q
2582  * 1 1 = 1*P + 1*Q
2583  *
2584  * LUT slot for the window bits (bitsi, bitsj): index = (bitsj+1) & (0-bitsi)
2585  *
2586  * Input:   P = (X1 : Y1 : Z1) and
2587  *          Q = (X2 : Y2 : Z2)
2588  * Output: mem_loc, memory location for the LUT.
2589  */
2590 
/*
 * Build the 3-entry Shamir LUT for the P-521 double scalar multiplication
 * (1-bit window per scalar). Entries are Jacobian triples (X, Y, Z), each
 * field N_wordlen words, stored consecutively in CASPER memory:
 *   slot 0 = 0*P + 1*Q,  slot 1 = 1*P + 0*Q,  slot 2 = 1*P + 1*Q.
 * Px/Py and Qx/Qy are the affine base points (presumably already in
 * Montgomery form -- TODO confirm against callers).
 */
precompute_double_scalar_LUT4(uint32_t * Px,uint32_t * Py,uint32_t * Qx,uint32_t * Qy)2591 static void precompute_double_scalar_LUT4(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy)
2592 {
2593     uint32_t *Z, *mem_loc, *ONE;
2594     uint32_t index = 0;
2595 
     /* NISTr521 is used here as the Z-coordinate constant "one"
      * (presumably 1 in Montgomery representation -- TODO confirm). */
2596     ONE = NISTr521;
2597 
2598     /* Re-use memory from different scratch space since no
2599      * projective point addition is used below. */
2600     Z       = &CASPER_MEM[(11U * N_wordlen + 4U) + 5U * (N_wordlen + 4U)];
2601     mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];
2602 
     /* Z := ONE, so the affine inputs become valid Jacobian points. */
2603     CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t));
2604 
2605     // 0*P + 1*Q
2606     CASPER_MEMCPY(&mem_loc[index], Qx, N_wordlen * sizeof(uint32_t));
2607     index += N_wordlen;
2608     CASPER_MEMCPY(&mem_loc[index], Qy, N_wordlen * sizeof(uint32_t));
2609     index += N_wordlen;
2610     CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t));
2611     index += N_wordlen;
2612 
2613     // 1*P + 0*Q
2614     CASPER_MEMCPY(&mem_loc[index], Px, N_wordlen * sizeof(uint32_t));
2615     index += N_wordlen;
2616     CASPER_MEMCPY(&mem_loc[index], Py, N_wordlen * sizeof(uint32_t));
2617     index += N_wordlen;
2618     CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t));
2619     index += N_wordlen;
2620 
2621     // 1*P + 1*Q
     /* Compute P + Q (projective P plus affine Q) directly into slot 2. */
2622     Jac_add_affine(&mem_loc[index], &mem_loc[index + N_wordlen], &mem_loc[index + 2U * N_wordlen], Px, Py, Z, Qx, Qy);
2623 }
2624 
/* Word offsets of the X/Y/Z fields of Shamir-LUT entry x: each entry is one
 * Jacobian point stored as three consecutive N_wordlen-word values. */
2625 #define GETLUTX(x) (3U * (x)*N_wordlen)
2626 #define GETLUTY(x) (3U * (x)*N_wordlen + 1U * N_wordlen)
2627 #define GETLUTZ(x) (3U * (x)*N_wordlen + 2U * N_wordlen)
2628 
2629 /* Compute the double scalar multiplication
2630  * (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
2631  * Using Shamir's trick and precomputing 16 points.
2632  * This code is *not* constant time since this is used
2633  * for verification only.
2634  */
/*
 * Parameters:
 *   X3, Y3, Z3 : output Jacobian point  k1*(X1,Y1) + k2*(X2,Y2)
 *   X1, Y1, k1 : first affine base point and its scalar
 *   X2, Y2, k2 : second affine base point and its scalar
 * Window width is 2 bits (16-entry LUT) for P-256/P-384 (N_wordlen 8/12)
 * and 1 bit (4-entry LUT) for P-521 (N_wordlen 18).
 */
double_scalar_multiplication(uint32_t * X3,uint32_t * Y3,uint32_t * Z3,uint32_t * X1,uint32_t * Y1,uint32_t * k1,uint32_t * X2,uint32_t * Y2,uint32_t * k2)2635 void double_scalar_multiplication(uint32_t *X3,
2636                                   uint32_t *Y3,
2637                                   uint32_t *Z3,
2638                                   uint32_t *X1,
2639                                   uint32_t *Y1,
2640                                   uint32_t *k1,
2641                                   uint32_t *X2,
2642                                   uint32_t *Y2,
2643                                   uint32_t *k2)
2644 {
2645     uint32_t index = 0, c = 0;
2646     uint32_t *p1 = NULL, *p2 = NULL, x1, x2, *lut, *Tx = NULL, *Ty = NULL, *Tz = NULL;
     /* NOTE(review): bitlen and shiftr remain uninitialized if N_wordlen is
      * not 8, 12 or 18 -- callers must guarantee a supported curve. */
2647     size_t bitlen, shiftr, shiftl = 0u;
2648 
     /* Per-curve setup: precompute the LUT and select window parameters.
      * shiftr extracts the top window bits of a 32-bit word; shiftl is the
      * window width in bits. */
2649     if (N_wordlen == 8U)
2650     {
2651         bitlen = (size_t)kCASPER_ECC_P256_N_bitlen;
2652         precompute_double_scalar_LUT16(X1, Y1, X2, Y2);
2653         shiftr = 30U;
2654         shiftl = 2U;
2655     }
2656 
2657     if (N_wordlen == 12U)
2658     {
2659         bitlen = (size_t)kCASPER_ECC_P384_N_bitlen;
2660         precompute_double_scalar_LUT16(X1, Y1, X2, Y2);
2661         shiftr = 30U;
2662         shiftl = 2U;
2663     }
2664 
2665     if (N_wordlen == 18U)
2666     {
2667         bitlen = (size_t)kCASPER_ECC_P521_N_bitlen;
2668         precompute_double_scalar_LUT4(X1, Y1, X2, Y2);
2669         shiftr = 31U;
2670         shiftl = 1U;
2671     }
2672 
2673     lut = &CASPER_MEM[(20U * N_wordlen + 80U)];
2674 
     /* Scratch: p1/p2 hold working copies of the scalars (consumed MSB-first
      * by left-shifting); Tx/Ty/Tz stage one LUT entry per iteration.
      * They are placed directly after the LUT (16*3 = 48 fields for the
      * 2-bit window, 4*3 = 12 fields for the 1-bit window). */
2675     if (N_wordlen == 8U || N_wordlen == 12U)
2676     {
2677         p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen];
2678         p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)];
2679 
2680         Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)];
2681         Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 3U * (N_wordlen + 4U)];
2682         Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 4U * (N_wordlen + 4U)];
2683     }
2684 
2685     if (N_wordlen == 18U)
2686     {
2687         p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen];
2688         p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 1U * (N_wordlen + 4U)];
2689 
2690         Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 2U * (N_wordlen + 4U)];
2691         Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 3U * (N_wordlen + 4U)];
2692         Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 4U * (N_wordlen + 4U)];
2693     }
2694 
2695     CASPER_MEMCPY(p1, k1, sizeof(uint32_t) * N_wordlen);
2696     CASPER_MEMCPY(p2, k2, sizeof(uint32_t) * N_wordlen);
2697 
2698     /* Check if we can slide. */
     /* Skip leading positions where both scalars have a zero top bit; no
      * point accumulated yet, so no doubling is required while sliding. */
2699     while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U && c < bitlen)
2700     {
2701         shiftleft(p1, p1, 1U);
2702         shiftleft(p2, p2, 1U);
2703         c++;
2704         /* No doubling needed. */
2705     }
2706 
2707     /* Set the first value. */
2708     x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr;
2709     x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr;
2710     if (N_wordlen == 8U || N_wordlen == 12U)
2711     {
         /* Map the (x1, x2) 2-bit window pair to a LUT slot; the subtraction
          * compensates for the entries the table does not store. */
2712         index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U;
2713     }
2714 
2715     if (N_wordlen == 18U)
2716     {
         /* 1-bit window: 0 -> 0*P+1*Q, 1 -> 1*P+0*Q, 2 -> 1*P+1*Q. */
2717         index = (((x2) + 1U) & (0U - (x1)));
2718     }
2719     shiftleft(p1, p1, shiftl);
2720     shiftleft(p2, p2, shiftl);
2721 
     /* Initialize the accumulator directly from the selected LUT entry. */
2722     CASPER_MEMCPY(X3, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t));
2723     CASPER_MEMCPY(Y3, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t));
2724     CASPER_MEMCPY(Z3, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t));
2725     c += shiftl;
2726 
2727     // todo: create an is_zero function
2728 
     /* Loop until both scalar copies are exhausted (all-zero).
      * NOTE(review): this scan always reads 18 words from p1 and p2, the
      * maximum (P-521) word length; for P-256/P-384 it reads past the
      * copied scalars into adjacent scratch -- presumably benign because
      * the c-based bounds checks below terminate the loop, but confirm. */
2729     while ((casper_get_word(&p1[0]) | casper_get_word(&p1[1]) | casper_get_word(&p1[2]) | casper_get_word(&p1[3]) |
2730             casper_get_word(&p1[4]) | casper_get_word(&p1[5]) | casper_get_word(&p1[6]) | casper_get_word(&p1[7]) |
2731             casper_get_word(&p1[8]) | casper_get_word(&p1[9]) | casper_get_word(&p1[10]) | casper_get_word(&p1[11]) |
2732             casper_get_word(&p1[12]) | casper_get_word(&p1[13]) | casper_get_word(&p1[14]) | casper_get_word(&p1[15]) |
2733             casper_get_word(&p1[16]) | casper_get_word(&p1[17]) | casper_get_word(&p2[0]) | casper_get_word(&p2[1]) |
2734             casper_get_word(&p2[2]) | casper_get_word(&p2[3]) | casper_get_word(&p2[4]) | casper_get_word(&p2[5]) |
2735             casper_get_word(&p2[6]) | casper_get_word(&p2[7]) | casper_get_word(&p2[8]) | casper_get_word(&p2[9]) |
2736             casper_get_word(&p2[10]) | casper_get_word(&p2[11]) | casper_get_word(&p2[12]) | casper_get_word(&p2[13]) |
2737             casper_get_word(&p2[14]) | casper_get_word(&p2[15]) | casper_get_word(&p2[16]) |
2738             casper_get_word(&p2[17])) != 0U)
2739     {
2740         /* Check if we can slide. */
         /* Sliding window: while both top bits are zero, double only. */
2741         while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U &&
2742                c < bitlen)
2743         {
2744             shiftleft(p1, p1, 1U);
2745             shiftleft(p2, p2, 1U);
2746             Jac_double(X3, Y3, Z3, X3, Y3, Z3);
2747             c++;
2748         }
2749 
         /* A full window no longer fits; the final bit (if any) is handled
          * by the special case after the loop. */
2750         if (c >= (bitlen - 1U))
2751         {
2752             break;
2753         }
2754 
         /* One doubling per window bit before adding the LUT entry. */
2755         for (uint32_t i = 0; i < shiftl; i++)
2756         {
2757             Jac_double(X3, Y3, Z3, X3, Y3, Z3);
2758         }
2759 
2760         x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr;
2761         x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr;
2762 
2763         if (N_wordlen == 8U || N_wordlen == 12U)
2764         {
2765             index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U;
2766         }
2767 
2768         if (N_wordlen == 18U)
2769         {
2770             index = (((x2) + 1U) & (0U - (x1)));
2771         }
2772 
2773         shiftleft(p1, p1, shiftl);
2774         shiftleft(p2, p2, shiftl);
2775 
         /* Stage the LUT entry in scratch before the projective addition. */
2776         CASPER_MEMCPY(Tx, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t));
2777         CASPER_MEMCPY(Ty, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t));
2778         CASPER_MEMCPY(Tz, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t));
2779 
2780         Jac_addition(X3, Y3, Z3, X3, Y3, Z3, Tx, Ty,
2781                      Tz); //&lut[GETLUTX(index)], &lut[GETLUTY(index)], &lut[GETLUTZ(index)]);
2782         c += shiftl;
2783     }
2784 
2785     /* Special case in the end. */
     /* Exactly one scalar bit remains: double once, then add each base
      * point whose final bit is set (affine additions). */
2786     if (c == (bitlen - 1U))
2787     {
2788         Jac_double(X3, Y3, Z3, X3, Y3, Z3);
2789         x1 = casper_get_word(&p1[N_wordlen - 1U]) >> 31;
2790         x2 = casper_get_word(&p2[N_wordlen - 1U]) >> 31;
2791         if (0U != x1)
2792         {
2793             Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X1, Y1);
2794         }
2795         if (x2 != 0U)
2796         {
2797             Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X2, Y2);
2798         }
2799         c++;
2800     }
2801 
     /* Trailing zero bits of both scalars: doubling only. */
2802     while (c < bitlen)
2803     {
2804         Jac_double(X3, Y3, Z3, X3, Y3, Z3);
2805         c++;
2806     }
2807 }
2808 
/*
 * Compute the modular inverse c = a^(p-2) = a^(-1) mod p for NIST P-256
 * via Fermat's little theorem, using a fixed addition chain for the
 * exponent p-2 (the hex fragments in the comments below track the
 * exponent accumulated so far). Works transparently in the Montgomery
 * domain: Montgomery in -> Montgomery inverse out.
 */
invert_mod_p256(uint32_t * c,uint32_t * a)2809 static void invert_mod_p256(uint32_t *c, uint32_t *a)
2810 {
2811     int i;
     /* Chain temporaries: s1/s2/s4/s8 hold a^(0xF), a^(0xFF), a^(0xFFFF),
      * a^(0xFFFFFFFF) respectively; t/t2/tmp are working registers. */
2812     uint32_t *t, *t2, *s1, *s2, *s4, *s8, *tmp;
2813 
2814     /* Assuming it is safe to use the ECC scratch size. */
2815     t   = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2816                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2817                     2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2818     t2  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2819                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2820                      3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2821     s1  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2822                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2823                      4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2824     s2  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2825                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2826                      5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2827     s4  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2828                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2829                      6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2830     s8  = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2831                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2832                      7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2833     tmp = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2834                        (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2835                       8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2836 
2837     // t2 = n^(2^1)*n  # 11
2838     square_casper(tmp, a);
2839     multiply_casper(t2, tmp, a);
2840 
2841     // s1 = t2^(2^2)*t2  # F
2842     square_casper(s1, t2);
2843     square_casper(tmp, s1);
2844     multiply_casper(s1, tmp, t2);
2845 
2846     // s2 = s1^(2^4)*s1  # FF
2847     square_casper(s2, s1);
2848     // for (i = 1; i < 4; i++) square(s2, s2);
     /* Squarings are ping-ponged between two buffers because the CASPER
      * primitives cannot operate fully in place here. */
2849     square_casper(tmp, s2);
2850     square_casper(s2, tmp);
2851     square_casper(tmp, s2);
2852     multiply_casper(s2, tmp, s1);
2853 
2854     // s4 = s2^(2^8)*s2  # FFFF
2855     square_casper(s4, s2);
2856     for (i = 1; i < 7; i += 2)
2857     {
2858         square_casper(tmp, s4);
2859         square_casper(s4, tmp);
2860     }
2861     square_casper(tmp, s4);
2862     multiply_casper(s4, tmp, s2);
2863 
2864     // s8 = s4^(2^16)*s4  # FFFFFFFF
2865     square_casper(s8, s4);
2866     for (i = 1; i < 15; i += 2)
2867     {
2868         square_casper(tmp, s8);
2869         square_casper(s8, tmp);
2870     }
2871     square_casper(tmp, s8);
2872     multiply_casper(s8, tmp, s4);
2873 
2874     // t = s8^(2^32)*n  # ffffffff00000001
2875     square_casper(tmp, s8);
2876     for (i = 1; i < 31; i += 2)
2877     {
2878         square_casper(t, tmp);
2879         square_casper(tmp, t);
2880     }
2881     square_casper(t, tmp);
2882     multiply_casper(tmp, t, a);
2883 
2884     // t = t^(2^128)*s8 # ffffffff00000001000000000000000000000000ffffffff
2885     for (i = 0; i < 128; i += 2)
2886     {
2887         square_casper(t, tmp);
2888         square_casper(tmp, t);
2889     }
2890     multiply_casper(t, tmp, s8);
2891 
2892     // t = t^(2^32)*s8  # ffffffff00000001000000000000000000000000ffffffffffffffff
2893     for (i = 0; i < 32; i += 2)
2894     {
2895         square_casper(tmp, t);
2896         square_casper(t, tmp);
2897     }
2898     multiply_casper(tmp, t, s8);
2899 
2900     // t = t^(2^16)*s4  # ffffffff00000001000000000000000000000000ffffffffffffffffffff
2901     for (i = 0; i < 16; i += 2)
2902     {
2903         square_casper(t, tmp);
2904         square_casper(tmp, t);
2905     }
2906     multiply_casper(t, tmp, s4);
2907 
2908     // t = t^(2^8)*s2   # ffffffff00000001000000000000000000000000ffffffffffffffffffffff
2909     for (i = 0; i < 8; i += 2)
2910     {
2911         square_casper(tmp, t);
2912         square_casper(t, tmp);
2913     }
2914     multiply_casper(tmp, t, s2);
2915 
2916     // t = t^(2^4)*s1   # ffffffff00000001000000000000000000000000fffffffffffffffffffffff
2917     for (i = 0; i < 4; i += 2)
2918     {
2919         square_casper(t, tmp);
2920         square_casper(tmp, t);
2921     }
2922     multiply_casper(t, tmp, s1);
2923 
2924     // t = t^(2^2)*t2
2925     square_casper(tmp, t);
2926     square_casper(t, tmp);
2927     multiply_casper(tmp, t, t2);
2928 
2929     // t = t^(2^2)*n    # ffffffff00000001000000000000000000000000fffffffffffffffffffffffd
     /* Final step writes the result straight into the caller's buffer. */
2930     square_casper(t, tmp);
2931     square_casper(tmp, t);
2932     multiply_casper(c, tmp, a);
2933 }
2934 
2935 // A and C do not need to be in Casper memory
/*
 * Convert A into Montgomery form for NIST P-256: C = A * R mod p
 * (computed as the Montgomery product of A and the precomputed
 * constant R^2 = 2^512 mod p). A and C do not need to reside in
 * CASPER memory; both operands are staged through CASPER scratch.
 */
toMontgomery_ECC_P256(uint32_t * C,uint32_t * A)2936 static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A)
2937 {
2938     /* R^2 = 2^512 mod p, used to convert values to Montgomery form. */
2939     uint32_t R2[kCASPER_ECC_P256_wordlen] = {0x00000003,  0x00000000,  0xffffffffU, 0xfffffffbU,
2940                                              0xfffffffeU, 0xffffffffU, 0xfffffffdU, 0x4};
2941     uint32_t *T1, *T2, *T3;
     /* Three scratch fields carved out of the ECC scratch area. */
2942     T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2943                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2944                      0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2945     T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2946                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2947                      1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2948     T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
2949                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
2950                      2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
2951 
2952     CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
2953     CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
2954 
     /* Montgomery multiply: T3 = A * R^2 * R^(-1) = A * R mod p. */
2955     multiply_casper(T3, T2, T1);
2956     CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
2957 }
2958 
2959 /* Compute inversion modulo NIST-p384 using Fermats little theorem.
2960  * Using c = a^(p-2) = a^(-1) mod p.
2961  * This computes the modular inversion if all arithmetic is "regular"
2962  * modular arithmetic or computes automatically the Montgomery inverse
2963  * if all arithmetic is Montgomery arithmetic.
2964  */
2965 
/*
 * c = a^(p-2) = a^(-1) mod p for NIST P-384 (see comment above).
 * The "# Nf" annotations below appear to track the exponent built so far
 * as a run of N hex 'f' digits (e.g. "# 4f" ~ 0xffff) -- TODO confirm.
 */
invert_mod_p384(uint32_t * c,uint32_t * a)2966 static void invert_mod_p384(uint32_t *c, uint32_t *a)
2967 {
2968     int i;
2969     uint32_t *e, *d, *tmp, *t0, *t1, *t2, *t3, *t4, *t5, *t6; // 10 residues needed
2970 
2971     /* Assuming it is safe to use the LUT scratch size.
2972      * Hence, do not invert while elements in the LUT are needed.
2973      */
2974     e = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2975     d = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2976     tmp =
2977         &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2978     t0 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2979     t1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2980     t2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2981     t3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2982     t4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2983     t5 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2984     t6 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
2985 
     /* Small powers of a; trailing numbers are the exponents reached. */
2986     square_casper(tmp, a);        // 2
2987     square_casper(t1, tmp);       // 4
2988     square_casper(tmp, t1);       // 8
2989     multiply_casper(t2, tmp, t1); // 12
2990     multiply_casper(d, a, t2);    // 13
2991     multiply_casper(e, d, a);     // 14
2992     multiply_casper(t0, e, a);    // 15
2993 
2994     // t1 = t0^(2^4)*t0     # ff
2995     square_casper(tmp, t0);
2996     square_casper(t1, tmp);
2997     square_casper(tmp, t1);
2998     square_casper(t2, tmp);
2999     multiply_casper(t1, t2, t0);
3000 
     /* Each stanza below doubles the exponent's length by repeated
      * squaring (ping-ponged between two buffers) and multiplies in a
      * previously computed residue. */
3001     // t2 = t1^(2^8)*t1   # 4f
3002     square_casper(tmp, t1);
3003     for (i = 0; i < 3; i++)
3004     {
3005         square_casper(t3, tmp);
3006         square_casper(tmp, t3);
3007     }
3008     square_casper(t3, tmp);
3009     multiply_casper(t2, t3, t1);
3010 
3011     // t3 = t2^(2^16)*t2  # 8f
3012     square_casper(tmp, t2);
3013     for (i = 0; i < 7; i++)
3014     {
3015         square_casper(t4, tmp);
3016         square_casper(tmp, t4);
3017     }
3018     square_casper(t4, tmp);
3019     multiply_casper(t3, t4, t2);
3020 
3021     // t4 = t3^(2^32)*t3  # 16f
3022     square_casper(tmp, t3);
3023     for (i = 0; i < 15; i++)
3024     {
3025         square_casper(t5, tmp);
3026         square_casper(tmp, t5);
3027     }
3028     square_casper(t5, tmp);
3029     multiply_casper(t4, t5, t3);
3030 
3031     // t5 = t4^(2^64)*t4  # 32f
3032     square_casper(tmp, t4);
3033     for (i = 0; i < 31; i++)
3034     {
3035         square_casper(t6, tmp);
3036         square_casper(tmp, t6);
3037     }
3038     square_casper(t6, tmp);
3039     multiply_casper(t5, t6, t4);
3040 
3041     // t5 = t5^(2^64)*t4  # 48f
3042     square_casper(tmp, t5);
3043     for (i = 0; i < 31; i++)
3044     {
3045         square_casper(t6, tmp);
3046         square_casper(tmp, t6);
3047     }
3048     square_casper(t6, tmp);
3049     multiply_casper(t5, t6, t4);
3050 
3051     // t5 = t5^(2^32)*t3  # 56f
3052     square_casper(tmp, t5);
3053     for (i = 0; i < 15; i++)
3054     {
3055         square_casper(t6, tmp);
3056         square_casper(tmp, t6);
3057     }
3058     square_casper(t6, tmp);
3059     multiply_casper(t5, t6, t3);
3060 
3061     // t5 = t5^(2^16)*t2  # 60f
3062     square_casper(tmp, t5);
3063     for (i = 0; i < 7; i++)
3064     {
3065         square_casper(t6, tmp);
3066         square_casper(tmp, t6);
3067     }
3068     square_casper(t6, tmp);
3069     multiply_casper(t5, t6, t2);
3070 
3071     // t5 = t5^(2^8)*t1   # 62f
3072     square_casper(tmp, t5);
3073     for (i = 0; i < 3; i++)
3074     {
3075         square_casper(t6, tmp);
3076         square_casper(tmp, t6);
3077     }
3078     square_casper(t6, tmp);
3079     multiply_casper(t5, t6, t1);
3080 
3081     // n = t5^(2^4)*t0     # 63f
3082     square_casper(tmp, t5);
3083     for (i = 0; i < 1; i++)
3084     {
3085         square_casper(t6, tmp);
3086         square_casper(tmp, t6);
3087     }
3088     square_casper(t6, tmp);
3089     multiply_casper(t5, t6, t0);
3090 
3091     // n = n^(2^4)*e
3092     square_casper(tmp, t5);
3093     for (i = 0; i < 1; i++)
3094     {
3095         square_casper(t6, tmp);
3096         square_casper(tmp, t6);
3097     }
3098     square_casper(t6, tmp);
3099     multiply_casper(t5, t6, e);
3100 
3101     // n = n^(2^32)*t3
3102     square_casper(tmp, t5);
3103     for (i = 0; i < 15; i++)
3104     {
3105         square_casper(t6, tmp);
3106         square_casper(tmp, t6);
3107     }
3108     square_casper(t6, tmp);
3109     multiply_casper(t5, t6, t3);
3110 
3111     // n = n^(2^64)
     /* Pure squaring run -- this window of the exponent is all zero. */
3112     square_casper(tmp, t5);
3113     for (i = 0; i < 31; i++)
3114     {
3115         square_casper(t6, tmp);
3116         square_casper(tmp, t6);
3117     }
3118     square_casper(t5, tmp);
3119 
3120     // n = n^(2^16)*t2
3121     square_casper(tmp, t5);
3122     for (i = 0; i < 7; i++)
3123     {
3124         square_casper(t6, tmp);
3125         square_casper(tmp, t6);
3126     }
3127     square_casper(t6, tmp);
3128     multiply_casper(t5, t6, t2);
3129 
3130     // n = n^(2^8)*t1
3131     square_casper(tmp, t5);
3132     for (i = 0; i < 3; i++)
3133     {
3134         square_casper(t6, tmp);
3135         square_casper(tmp, t6);
3136     }
3137     square_casper(t6, tmp);
3138     multiply_casper(t5, t6, t1);
3139 
3140     // n = n^(2^4)*t0
3141     square_casper(tmp, t5);
3142     for (i = 0; i < 1; i++)
3143     {
3144         square_casper(t6, tmp);
3145         square_casper(tmp, t6);
3146     }
3147     square_casper(t6, tmp);
3148     multiply_casper(t5, t6, t0);
3149 
3150     // n = n^(2^4)*d
     /* Final multiplication writes the inverse into the caller's buffer. */
3151     square_casper(tmp, t5);
3152     for (i = 0; i < 1; i++)
3153     {
3154         square_casper(t6, tmp);
3155         square_casper(tmp, t6);
3156     }
3157     square_casper(t6, tmp);
3158     multiply_casper(c, t6, d);
3159 }
3160 
3161 // A and C do not need to be in Casper memory
/*
 * Convert A into Montgomery form for NIST P-384: C = A * R mod p
 * (Montgomery product of A with the precomputed constant
 * R^2 = 2^768 mod p). A and C do not need to be in CASPER memory;
 * both operands are staged through CASPER scratch.
 */
toMontgomery_ECC_P384(uint32_t * C,uint32_t * A)3162 static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A)
3163 {
3164     /* R^2 = 2^768 mod p, used to convert values to Montgomery form. */
3165     uint32_t R2[kCASPER_ECC_P384_wordlen] = {0x00000001, 0xfffffffeU, 0x00000000, 0x00000002, 0x00000000, 0xfffffffeU,
3166                                              0x00000000, 0x00000002,  0x1,        0x0,        0x0,        0x0};
3167     uint32_t *T1, *T2, *T3;
     /* Three scratch fields carved out of the ECC scratch area. */
3168     T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
3169                       (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
3170                      0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
3171     T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
3172                       (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
3173                      1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
3174     T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
3175                       (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
3176                      2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
3177 
3178     CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
3179     CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
3180 
     /* Montgomery multiply: T3 = A * R^2 * R^(-1) = A * R mod p. */
3181     multiply_casper(T3, T2, T1);
3182     CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
3183 }
3184 
/*
 * c = a^(p-2) = a^(-1) mod p for NIST P-521 via Fermat's little theorem,
 * using repeated squaring/multiplication; p = 2^521 - 1 per the fast
 * reduction used elsewhere in this driver (see MultprecCiosMul521_ct).
 * As with the other curves, this works transparently in the Montgomery
 * domain. The "// d3 = 2^k * x" comments label each squaring stanza.
 */
invert_mod_p521(uint32_t * c,uint32_t * a)3185 static void invert_mod_p521(uint32_t *c, uint32_t *a)
3186 {
3187     int i;
3188     uint32_t *e3, *d2, *d3, *d4, *T2, *T4; // 6 residues needed
3189 
3190     /* Assuming it is safe to use the LUT scratch size.
3191      * Hence, do not invert while elements in the LUT are needed.
3192      */
3193     e3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3194     d2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3195     d3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3196     d4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3197     T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3198     T4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3199 
     /* T2 = a^3 (two consecutive one bits of the exponent). */
3200     square_casper(d2, a);
3201     multiply_casper(T2, d2, a);
3202 
3203     // d3 = 2^2 * T2
     /* T4 = a^(0xF): 4 one bits. */
3204     square_casper(d3, T2);
3205     square_casper(e3, d3);
3206     multiply_casper(T4, e3, T2);
3207 
3208     // d3 = 2^4 * T4
     /* Each stanza squares the running residue k times (ping-ponged between
      * d3 and e3) and multiplies in an earlier residue, doubling the run of
      * one bits accumulated in the exponent. */
3209     square_casper(d3, T4);
3210     square_casper(e3, d3);
3211     square_casper(d3, e3);
3212     square_casper(e3, d3);
3213     multiply_casper(d2, e3, T4);
3214 
3215     // d3 = 2^8 * d2
3216     square_casper(d3, d2);
3217     square_casper(e3, d3);
3218     for (i = 0; i < 3; i++)
3219     {
3220         square_casper(d3, e3);
3221         square_casper(e3, d3);
3222     }
3223     multiply_casper(d4, e3, d2);
3224 
3225     // d3 = 2^16 * d2
3226     square_casper(d3, d4);
3227     square_casper(e3, d3);
3228     for (i = 0; i < 7; i++)
3229     {
3230         square_casper(d3, e3);
3231         square_casper(e3, d3);
3232     }
3233     multiply_casper(d2, e3, d4);
3234 
3235     // d3 = 2^32 * d2
3236     square_casper(d3, d2);
3237     square_casper(e3, d3);
3238     for (i = 0; i < 15; i++)
3239     {
3240         square_casper(d3, e3);
3241         square_casper(e3, d3);
3242     }
3243     multiply_casper(d4, e3, d2);
3244 
3245     // d3 = 2^64 * d2
3246     square_casper(d3, d4);
3247     square_casper(e3, d3);
3248     for (i = 0; i < 31; i++)
3249     {
3250         square_casper(d3, e3);
3251         square_casper(e3, d3);
3252     }
3253     multiply_casper(d2, e3, d4);
3254 
3255     // d3 = 2^128 * d2
3256     square_casper(d3, d2);
3257     square_casper(e3, d3);
3258     for (i = 0; i < 63; i++)
3259     {
3260         square_casper(d3, e3);
3261         square_casper(e3, d3);
3262     }
3263     multiply_casper(d4, e3, d2);
3264 
3265     // d3 = 2^256 * d2
3266     square_casper(d3, d4);
3267     square_casper(e3, d3);
3268     for (i = 0; i < 127; i++)
3269     {
3270         square_casper(d3, e3);
3271         square_casper(e3, d3);
3272     }
3273     multiply_casper(d2, e3, d4);
3274 
     /* Tail of the exponent: fold in the small residues T2, T4 and a. */
3275     // d3 = 2^2 * d2
3276     square_casper(d3, d2);
3277     square_casper(e3, d3);
3278     multiply_casper(d2, e3, T2);
3279 
3280     // d3 = 2^4 * d2
3281     square_casper(d3, d2);
3282     square_casper(e3, d3);
3283     square_casper(d3, e3);
3284     square_casper(e3, d3);
3285     multiply_casper(d2, e3, T4);
3286 
3287     square_casper(d3, d2);
3288     multiply_casper(d2, d3, a);
3289 
3290     // d3 = 2 ^ 2 * d2
     /* Final multiplication writes the inverse into the caller's buffer. */
3291     square_casper(d3, d2);
3292     square_casper(e3, d3);
3293     multiply_casper(c, e3, a);
3294 }
3295 
/*
 * Convert A into Montgomery form for NIST P-521: C = A * R mod p
 * (Montgomery product of A with the precomputed constant
 * R^2 = 2^1152 mod p; 1152 = 64-bit-limb padded width used by this
 * driver, see kCASPER_ECC_P521_N_bitlen = 576 words * 2). A and C do
 * not need to be in CASPER memory; both operands are staged through
 * CASPER scratch.
 */
toMontgomery_ECC_P521(uint32_t * C,uint32_t * A)3296 static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A)
3297 {
3298     /* R^2 = 2^1088 mod p, used to convert values to Montgomery form. */
3299     // uint32_t R2[NUM_LIMBS] = { 0x00000000, 0x4000, 0, 0,
3300     //                           0, 0, 0, 0,
3301     //                          0, 0, 0, 0,
3302     //                          0 };
3303     /* R^2 = 2^1152 mod p, used to convert values to Montgomery form. */
3304     uint32_t R2[kCASPER_ECC_P521_wordlen] = {0, 0, 0, 0x4000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3305 
3306     uint32_t *T1, *T2, *T3;
     /* Three scratch fields placed after the LUT scratch area. */
3307     T1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3308     T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3309     T3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
3310 
3311     CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
3312     CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
3313 
     /* Montgomery multiply: T3 = A * R^2 * R^(-1) = A * R mod p. */
3314     multiply_casper(T3, T2, T1);
3315     CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
3316 }
3317 
/*
 * Constant-time Montgomery (CIOS) modular multiplication specialized for the
 * NIST P-521 prime p = 2^521 - 1.
 *
 * Computes w_out = a * b * R^-1 mod Nmod (one 64-bit limb of R is divided out
 * per loop iteration). If a == NULL, the multiply-accumulate pass is skipped
 * and only the Montgomery reduction of the value already in w_out is done.
 *
 * Parameters:
 *   w_out - result accumulator; also the input when reducing only. Uses two
 *           extra limbs beyond N_dwordlen for carries.
 *   a, b  - operands (a may be NULL, see above).
 *   Nmod  - the modulus p, used for the final conditional subtraction.
 *   Np    - unused here: since p = 2^521 - 1 we have p = -1 (mod 2^64), so
 *           -p^-1 mod 2^64 == 1 and the quotient digit is just c_0 (see the
 *           reduction derivation below); presumably kept to match the generic
 *           CIOS routine's signature -- TODO confirm against callers.
 *
 * NOTE(review): the uint32_t* -> uint64_t* casts rely on strict aliasing
 * being disabled, which this driver enforces with the pragmas at the top of
 * the file.
 */
static void MultprecCiosMul521_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    uint64_t carry;
    uint64_t *a64, *b64, *w64;

    /* T1 is scratch space at the base of CASPER RAM, used to keep a copy of
     * the pre-subtraction value for the constant-time select at the end. */
    uint32_t *T1 = &CASPER_MEM[0], borrow;

    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;

    if (a != NULL)
    { /*  if !a, we are reducing only */
        PreZeroW(j, w_out);
    }
    /* Clear the two extra carry limbs above the N_dwordlen-limb accumulator. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /*  with accelerator */

    /*  loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /*  Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /*  push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /*  if mul&reduce vs. reduce only */
            /* w += a * b[j], done by the accelerator; detect overflow into the
             * second extra limb by comparing the top limb before and after. */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /*  max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /*  so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /*  accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /*  w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }

        /* Fast reduction using only shifts for this special shape:
         * (c - (-p^-1*c mod 2^64) * p)/2^64 =
         * (c - c_0 * p)/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * p)/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * (2^521-1))/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * 2^521 - c_0)/2^64 =
         * c_1 + c_2*2^64 + c_3*2^128 + c_4*2^192 + c_5*2^256 + c_6*2^320 + c_7*2^384 + c_8*2^448 + c_9*2^512 + c_0 *
         * 2^{448 + 9} so one only needs to compute this 128-bit addition: [c_8, c_9] + c_0 * 2^9
         */

        uint64_t *p64 = (uint64_t *)(uintptr_t)T1;

        /* Save c_0 * 2^9 into p64 before shifting the accumulator down. */
        /* p64[0] = w64[0] << 9;*/
        SET_DWORD(&p64[0], GET_DWORD(&w64[0]) << 9U);
        /* p64[1] = w64[0] >> (64 - 9); */
        SET_DWORD(&p64[1], GET_DWORD(&w64[0]) >> (64 - 9));
        /* Shift the accumulator right by one 64-bit limb (the division by
         * 2^64 in the derivation above). */
        /* w64[0] = w64[1]; */
        SET_DWORD(&w64[0], GET_DWORD(&w64[1]));
        /* w64[1] = w64[2]; */
        SET_DWORD(&w64[1], GET_DWORD(&w64[2]));
        /* w64[2] = w64[3]; */
        SET_DWORD(&w64[2], GET_DWORD(&w64[3]));
        /* w64[3] = w64[4]; */
        SET_DWORD(&w64[3], GET_DWORD(&w64[4]));
        /* w64[4] = w64[5]; */
        SET_DWORD(&w64[4], GET_DWORD(&w64[5]));
        /* w64[5] = w64[6]; */
        SET_DWORD(&w64[5], GET_DWORD(&w64[6]));
        /* w64[6] = w64[7]; */
        SET_DWORD(&w64[6], GET_DWORD(&w64[7]));

        /* Compute p64 = p64 + {w64[8], w64[9]} using one additonal double-length limb,
         * where p64 = w64[0] * 2^9.
         */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[8]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(2, (uint32_t)kCASPER_OpAdd64, /* kCASPER_OpAdd64, */
                                                 CA_MK_OFF(p64)));
        Accel_done();

        /* Fold the 128-bit sum back into the top of the shifted accumulator. */
        /* w64[7] = p64[0]; */
        SET_DWORD(&w64[7], GET_DWORD(&p64[0]));
        /* w64[8] = p64[1]; */
        SET_DWORD(&w64[8], GET_DWORD(&p64[1]));
        /* w64[9] = 0; */
        SET_DWORD(&w64[9], (uint64_t)0U);
    }

    /* memcpy(T1, w_out, (NUM_LIMBS + 1) * sizeof(uint32_t)); */
    /*  now check if need to subtract Nmod */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    /* if w_out > T1 then there was a borrow */
    /* borrow = (((uint32_t*)w_out)[NUM_LIMBS] > T1[NUM_LIMBS]); */
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));
    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    /*  w_out[NUM_LIMBS + 1] = 0; */
    /*  w_out[NUM_LIMBS] = 0; */
    /* Constant-time select: keep (w - N) unless the subtraction borrowed,
     * in which case restore the saved pre-subtraction value from T1. */
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int32_t)N_wordlen);
}
3424 
3425 #if defined(__GNUC__)
3426 /* End of enforcing O1 optimize level for gcc*/
3427 #pragma GCC pop_options
3428 #endif
3429 
3430 #if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
3431 // End of enforcing optimize off for clang
3432 #pragma clang optimize on
3433 #endif
3434