1 /*
2 * Implementation of curve P-256 (ECDH and ECDSA)
3 *
4 * Copyright The Mbed TLS Contributors
5 * Author: Manuel Pégourié-Gonnard.
6 * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
7 */
8
9 #include "p256-m.h"
10 #include "mbedtls/platform_util.h"
11 #include "psa/crypto.h"
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15
16 #if defined (MBEDTLS_PSA_P256M_DRIVER_ENABLED)
17
18 /*
19 * Zeroize memory - this should not be optimized away
20 */
21 #define zeroize mbedtls_platform_zeroize
22
23 /*
24 * Helpers to test constant-time behaviour with valgrind or MemSan.
25 *
26 * CT_POISON() is used for secret data. It marks the memory area as
27 * uninitialised, so that any branch or pointer dereference that depends on it
28 * (even indirectly) triggers a warning.
29 * CT_UNPOISON() is used for public data; it marks the area as initialised.
30 *
31 * These are macros in order to avoid interfering with origin tracking.
32 */
33 #if defined(CT_MEMSAN)
34
35 #include <sanitizer/msan_interface.h>
36 #define CT_POISON __msan_allocated_memory
37 // void __msan_allocated_memory(const volatile void* data, size_t size);
38 #define CT_UNPOISON __msan_unpoison
39 // void __msan_unpoison(const volatile void *a, size_t size);
40
41 #elif defined(CT_VALGRIND)
42
43 #include <valgrind/memcheck.h>
44 #define CT_POISON VALGRIND_MAKE_MEM_UNDEFINED
45 // VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr,_qzz_len)
46 #define CT_UNPOISON VALGRIND_MAKE_MEM_DEFINED
47 // VALGRIND_MAKE_MEM_DEFINED(_qzz_addr,_qzz_len)
48
49 #else
50 #define CT_POISON(p, sz)
51 #define CT_UNPOISON(p, sz)
52 #endif
53
54 /**********************************************************************
55 *
56 * Operations on fixed-width unsigned integers
57 *
58 * Represented using 32-bit limbs, least significant limb first.
59 * That is: x = x[0] + 2^32 x[1] + ... + 2^224 x[7] for 256-bit.
60 *
61 **********************************************************************/
62
63 /*
64 * 256-bit set to 32-bit value
65 *
66 * in: x in [0, 2^32)
67 * out: z = x
68 */
static void u256_set32(uint32_t z[8], uint32_t x)
{
    /* the least significant limb carries the value, all others are cleared */
    z[0] = x;
    unsigned i = 1;
    while (i < 8) {
        z[i++] = 0;
    }
}
76
77 /*
78 * 256-bit addition
79 *
80 * in: x, y in [0, 2^256)
81 * out: z = (x + y) mod 2^256
82 * c = (x + y) div 2^256
83 * That is, z + c * 2^256 = x + y
84 *
85 * Note: as a memory area, z must be either equal to x or y, or not overlap.
86 */
static uint32_t u256_add(uint32_t z[8],
                         const uint32_t x[8], const uint32_t y[8])
{
    /* acc holds limb sum plus incoming carry; both fit easily in 64 bits */
    uint64_t acc = 0;

    for (unsigned i = 0; i < 8; i++) {
        acc += (uint64_t) x[i] + y[i];
        z[i] = (uint32_t) acc;
        acc >>= 32; /* keep only the carry for the next limb */
    }

    return (uint32_t) acc;
}
100
101 /*
102 * 256-bit subtraction
103 *
104 * in: x, y in [0, 2^256)
105 * out: z = (x - y) mod 2^256
 *      c = 0 if x >= y, 1 otherwise
107 * That is, z = c * 2^256 + x - y
108 *
109 * Note: as a memory area, z must be either equal to x or y, or not overlap.
110 */
static uint32_t u256_sub(uint32_t z[8],
                         const uint32_t x[8], const uint32_t y[8])
{
    uint32_t borrow = 0;

    for (unsigned i = 0; i < 8; i++) {
        /* when a borrow occurs, the wrapped 64-bit difference has its
         * upper half all-ones; extract bit 32 as the new borrow flag */
        uint64_t diff = (uint64_t) x[i] - y[i] - borrow;
        z[i] = (uint32_t) diff;
        borrow = (uint32_t) (diff >> 32) & 1u;
    }

    return borrow;
}
124
125 /*
126 * 256-bit conditional assignment
127 *
128 * in: x in [0, 2^256)
129 * c in [0, 1]
130 * out: z = x if c == 1, z unchanged otherwise
131 *
132 * Note: as a memory area, z must be either equal to x, or not overlap.
133 */
static void u256_cmov(uint32_t z[8], const uint32_t x[8], uint32_t c)
{
    /* mask is all-ones when c == 1 and all-zeros when c == 0 */
    const uint32_t mask = 0u - c;
    for (unsigned i = 0; i < 8; i++) {
        /* branch-free select: z stays z when mask == 0, becomes x otherwise */
        z[i] ^= (z[i] ^ x[i]) & mask;
    }
}
141
142 /*
143 * 256-bit compare for equality
144 *
145 * in: x in [0, 2^256)
146 * y in [0, 2^256)
147 * out: 0 if x == y, unspecified non-zero otherwise
148 */
static uint32_t u256_diff(const uint32_t x[8], const uint32_t y[8])
{
    /* fold all limb differences together so the comparison always
     * touches every limb (constant-time) */
    uint32_t acc = 0;
    unsigned i = 8;
    while (i-- > 0) {
        acc |= x[i] ^ y[i];
    }
    return acc;
}
157
158 /*
159 * 256-bit compare to zero
160 *
161 * in: x in [0, 2^256)
162 * out: 0 if x == 0, unspecified non-zero otherwise
163 */
static uint32_t u256_diff0(const uint32_t x[8])
{
    /* OR all limbs together: the result is 0 iff every limb is 0,
     * and every limb is always read (constant-time) */
    uint32_t acc = x[0];
    for (unsigned i = 1; i < 8; i++) {
        acc |= x[i];
    }
    return acc;
}
172
173 /*
174 * 32 x 32 -> 64-bit multiply-and-accumulate
175 *
176 * in: x, y, z, t in [0, 2^32)
177 * out: x * y + z + t in [0, 2^64)
178 *
179 * Note: this computation cannot overflow.
180 *
181 * Note: this function has two pure-C implementations (depending on whether
182 * MUL64_IS_CONSTANT_TIME), and possibly optimised asm implementations.
183 * Start with the potential asm definitions, and use the C definition only if
 * we have no asm for the current toolchain & CPU.
185 */
186 static uint64_t u32_muladd64(uint32_t x, uint32_t y, uint32_t z, uint32_t t);
187
/* This macro is used to mark whether an asm implementation is found */
189 #undef MULADD64_ASM
190 /* This macro is used to mark whether the implementation has a small
191 * code size (ie, it can be inlined even in an unrolled loop) */
192 #undef MULADD64_SMALL
193
194 /*
195 * Currently assembly optimisations are only supported with GCC/Clang for
196 * Arm's Cortex-A and Cortex-M lines of CPUs, which start with the v6-M and
197 * v7-M architectures. __ARM_ARCH_PROFILE is not defined for v6 and earlier.
198 * Thumb and 32-bit assembly is supported; aarch64 is not supported.
199 */
200 #if defined(__GNUC__) &&\
201 defined(__ARM_ARCH) && __ARM_ARCH >= 6 && defined(__ARM_ARCH_PROFILE) && \
202 ( __ARM_ARCH_PROFILE == 77 || __ARM_ARCH_PROFILE == 65 ) /* 'M' or 'A' */ && \
203 !defined(__aarch64__)
204
205 /*
206 * This set of CPUs is conveniently partitioned as follows:
207 *
208 * 1. Cores that have the DSP extension, which includes a 1-cycle UMAAL
209 * instruction: M4, M7, M33, all A-class cores.
210 * 2. Cores that don't have the DSP extension, and also lack a constant-time
211 * 64-bit multiplication instruction:
212 * - M0, M0+, M23: 32-bit multiplication only;
213 * - M3: 64-bit multiplication is not constant-time.
214 */
215 #if defined(__ARM_FEATURE_DSP)
216
static uint64_t u32_muladd64(uint32_t x, uint32_t y, uint32_t z, uint32_t t)
{
    /* UMAAL computes {RdHi:RdLo} = Rn * Rm + RdLo + RdHi in a single,
     * 1-cycle instruction on DSP-capable cores (see partitioning above),
     * giving the full x * y + z + t without any data-dependent timing. */
    __asm__(
        /* UMAAL <RdLo>, <RdHi>, <Rn>, <Rm> */
        "umaal %[z], %[t], %[x], %[y]"
        : [z] "+l" (z), [t] "+l" (t)
        : [x] "l" (x), [y] "l" (y)
    );
    /* after UMAAL, t holds the high word and z the low word of the result */
    return ((uint64_t) t << 32) | z;
}
227 #define MULADD64_ASM
228 #define MULADD64_SMALL
229
230 #else /* __ARM_FEATURE_DSP */
231
232 /*
233 * This implementation only uses 16x16->32 bit multiplication.
234 *
235 * It decomposes the multiplicands as:
236 * x = xh:xl = 2^16 * xh + xl
237 * y = yh:yl = 2^16 * yh + yl
238 * and computes their product as:
239 * x*y = xl*yl + 2**16 (xh*yl + yl*yh) + 2**32 xh*yh
240 * then adds z and t to the result.
241 */
static uint64_t u32_muladd64(uint32_t x, uint32_t y, uint32_t z, uint32_t t)
{
    /* Compute x * y + z + t using only 16x16->32 'muls', which is
     * constant-time on cores without the DSP extension (see the
     * decomposition in the comment above). */

    /* First compute x*y, using 3 temporary registers */
    uint32_t tmp1, tmp2, tmp3;
    __asm__(
        ".syntax unified\n\t"
        /* start by splitting the inputs into halves */
        "lsrs %[u], %[x], #16\n\t"
        "lsrs %[v], %[y], #16\n\t"
        "uxth %[x], %[x]\n\t"
        "uxth %[y], %[y]\n\t"
        /* now we have %[x], %[y], %[u], %[v] = xl, yl, xh, yh */
        /* let's compute the 4 products we can form with those */
        "movs %[w], %[v]\n\t"
        "muls %[w], %[u]\n\t"
        "muls %[v], %[x]\n\t"
        "muls %[x], %[y]\n\t"
        "muls %[y], %[u]\n\t"
        /* now we have %[x], %[y], %[v], %[w] = xl*yl, xh*yl, xl*yh, xh*yh */
        /* let's split and add the first middle product */
        "lsls %[u], %[y], #16\n\t"
        "lsrs %[y], %[y], #16\n\t"
        "adds %[x], %[u]\n\t"
        "adcs %[y], %[w]\n\t"
        /* let's finish with the second middle product */
        "lsls %[u], %[v], #16\n\t"
        "lsrs %[v], %[v], #16\n\t"
        "adds %[x], %[u]\n\t"
        "adcs %[y], %[v]\n\t"
        : [x] "+l" (x), [y] "+l" (y),
          [u] "=&l" (tmp1), [v] "=&l" (tmp2), [w] "=&l" (tmp3)
        : /* no read-only inputs */
        : "cc"
    );
    /* the temporaries only matter inside the asm; silence unused warnings */
    (void) tmp1;
    (void) tmp2;
    (void) tmp3;

    /* Add z and t, using one temporary register */
    __asm__(
        ".syntax unified\n\t"
        "movs %[u], #0\n\t"
        "adds %[x], %[z]\n\t"
        "adcs %[y], %[u]\n\t"
        "adds %[x], %[t]\n\t"
        "adcs %[y], %[u]\n\t"
        : [x] "+l" (x), [y] "+l" (y), [u] "=&l" (tmp1)
        : [z] "l" (z), [t] "l" (t)
        : "cc"
    );
    (void) tmp1;

    /* x holds the low 32 bits of the result, y the high 32 bits */
    return ((uint64_t) y << 32) | x;
}
296 #define MULADD64_ASM
297
298 #endif /* __ARM_FEATURE_DSP */
299
300 #endif /* GCC/Clang with Cortex-M/A CPU */
301
302 #if !defined(MULADD64_ASM)
303 #if defined(MUL64_IS_CONSTANT_TIME)
static uint64_t u32_muladd64(uint32_t x, uint32_t y, uint32_t z, uint32_t t)
{
    /* relies on the platform's 32x32->64 multiplier being constant-time
     * (that's what MUL64_IS_CONSTANT_TIME asserts); cannot overflow since
     * (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1 */
    uint64_t acc = (uint64_t) x * y;
    acc += z;
    acc += t;
    return acc;
}
308 #define MULADD64_SMALL
309 #else
static uint64_t u32_muladd64(uint32_t x, uint32_t y, uint32_t z, uint32_t t)
{
    /* Split both multiplicands into 16-bit halves:
     * x = 2^16 * a1 + a0, y = 2^16 * b1 + b0 */
    const uint16_t a0 = (uint16_t) x;
    const uint16_t b0 = (uint16_t) y;
    const uint16_t a1 = x >> 16;
    const uint16_t b1 = y >> 16;

    /* x*y = a0*b0 + 2^16 (a1*b0 + a0*b1) + 2^32 a1*b1
     * using only 16x16->32 products, assumed constant-time */
    const uint32_t low  = (uint32_t) a0 * b0;
    const uint32_t mid1 = (uint32_t) a1 * b0;
    const uint32_t mid2 = (uint32_t) a0 * b1;
    const uint32_t high = (uint32_t) a1 * b1;

    /* fold the top halves of the middle products into the high word,
     * then add their bottom halves, then z and t (cannot overflow) */
    uint64_t acc = low + ((uint64_t) (high + (mid1 >> 16) + (mid2 >> 16)) << 32);
    acc += mid1 << 16;
    acc += mid2 << 16;
    acc += z;
    acc += t;

    return acc;
}
333 #endif /* MUL64_IS_CONSTANT_TIME */
334 #endif /* MULADD64_ASM */
335
336 /*
337 * 288 + 32 x 256 -> 288-bit multiply and add
338 *
339 * in: x in [0, 2^32)
340 * y in [0, 2^256)
341 * z in [0, 2^288)
342 * out: z_out = z_in + x * y mod 2^288
343 * c = z_in + x * y div 2^288
344 * That is, z_out + c * 2^288 = z_in + x * y
345 *
346 * Note: as a memory area, z must be either equal to y, or not overlap.
347 *
348 * This is a helper for Montgomery multiplication.
349 */
static uint32_t u288_muladd(uint32_t z[9], uint32_t x, const uint32_t y[8])
{
    uint32_t carry = 0;

    /* One schoolbook step: z[i] = low(x * y[i] + z[i] + carry), with the
     * new carry being the high word — u32_muladd64 cannot overflow. */
#define U288_MULADD_STEP(i) \
    do { \
        uint64_t prod = u32_muladd64(x, y[i], z[i], carry); \
        z[i] = (uint32_t) prod; \
        carry = (uint32_t) (prod >> 32); \
    } while( 0 )

/* When u32_muladd64 is small (MULADD64_SMALL), unroll so the compiler can
 * inline each step; otherwise a loop keeps code size down. */
#if defined(MULADD64_SMALL)
    U288_MULADD_STEP(0);
    U288_MULADD_STEP(1);
    U288_MULADD_STEP(2);
    U288_MULADD_STEP(3);
    U288_MULADD_STEP(4);
    U288_MULADD_STEP(5);
    U288_MULADD_STEP(6);
    U288_MULADD_STEP(7);
#else
    for (unsigned i = 0; i < 8; i++) {
        U288_MULADD_STEP(i);
    }
#endif

    /* propagate the final carry into the 9th limb; anything beyond that
     * is returned to the caller */
    uint64_t sum = (uint64_t) z[8] + carry;
    z[8] = (uint32_t) sum;
    carry = (uint32_t) (sum >> 32);

    return carry;
}
382
383 /*
384 * 288-bit in-place right shift by 32 bits
385 *
386 * in: z in [0, 2^288)
387 * c in [0, 2^32)
388 * out: z_out = z_in div 2^32 + c * 2^256
389 * = (z_in + c * 2^288) div 2^32
390 *
391 * This is a helper for Montgomery multiplication.
392 */
static void u288_rshift32(uint32_t z[9], uint32_t c)
{
    /* slide every limb down one position: a division by 2^32 ... */
    for (unsigned i = 1; i < 9; i++) {
        z[i - 1] = z[i];
    }
    /* ... then install the incoming carry as the new top limb */
    z[8] = c;
}
400
401 /*
402 * 256-bit import from big-endian bytes
403 *
404 * in: p = p0, ..., p31
405 * out: z = p0 * 2^248 + p1 * 2^240 + ... + p30 * 2^8 + p31
406 */
static void u256_from_bytes(uint32_t z[8], const uint8_t p[32])
{
    /* limb i takes bytes 4*(7-i) .. 4*(7-i)+3, big-endian within the limb:
     * the last byte of p lands in the least significant limb */
    for (unsigned i = 0; i < 8; i++) {
        const uint8_t *q = p + 4 * (7 - i);
        uint32_t limb = q[0];
        limb = (limb << 8) | q[1];
        limb = (limb << 8) | q[2];
        limb = (limb << 8) | q[3];
        z[i] = limb;
    }
}
417
418 /*
419 * 256-bit export to big-endian bytes
420 *
421 * in: z in [0, 2^256)
422 * out: p = p0, ..., p31 such that
423 * z = p0 * 2^248 + p1 * 2^240 + ... + p30 * 2^8 + p31
424 */
static void u256_to_bytes(uint8_t p[32], const uint32_t z[8])
{
    /* inverse of u256_from_bytes: write limb i as four big-endian bytes
     * starting at offset 4*(7-i) */
    for (unsigned i = 0; i < 8; i++) {
        uint8_t *q = p + 4 * (7 - i);
        uint32_t limb = z[i];
        q[3] = (uint8_t) limb; limb >>= 8;
        q[2] = (uint8_t) limb; limb >>= 8;
        q[1] = (uint8_t) limb; limb >>= 8;
        q[0] = (uint8_t) limb;
    }
}
435
436 /**********************************************************************
437 *
438 * Operations modulo a 256-bit prime m
439 *
440 * These are done in the Montgomery domain, that is x is represented by
441 * x * 2^256 mod m
442 * Numbers need to be converted to that domain before computations,
443 * and back from it afterwards.
444 *
445 * Inversion is computed using Fermat's little theorem.
446 *
447 * Assumptions on m:
448 * - Montgomery operations require that m is odd.
 * - Fermat's little theorem requires it to be a prime.
450 * - m256_inv() further requires that m % 2^32 >= 2.
451 * - m256_inv() also assumes that the value of m is not a secret.
452 *
453 * In practice operations are done modulo the curve's p and n,
454 * both of which satisfy those assumptions.
455 *
456 **********************************************************************/
457
458 /*
459 * Data associated to a modulus for Montgomery operations.
460 *
461 * m in [0, 2^256) - the modulus itself, must be odd
462 * R2 = 2^512 mod m
463 * ni = -m^-1 mod 2^32
464 */
typedef struct {
    uint32_t m[8];   /* the modulus itself, in [0, 2^256), must be odd */
    uint32_t R2[8];  /* 2^512 mod m — used to enter the Montgomery domain */
    uint32_t ni;     /* -m^-1 mod 2^32 — the Montgomery reduction constant */
}
m256_mod;
471
472 /*
473 * Data for Montgomery operations modulo the curve's p
474 */
static const m256_mod p256_p = {
    { /* the curve's p (little-endian limbs, as for all u256 values) */
        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
        0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF,
    },
    { /* 2^512 mod p */
        0x00000003, 0x00000000, 0xffffffff, 0xfffffffb,
        0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004,
    },
    0x00000001, /* -p^-1 mod 2^32 */
};
486
487 /*
488 * Data for Montgomery operations modulo the curve's n
489 */
static const m256_mod p256_n = {
    { /* the curve's n, the order of the base point G */
        0xFC632551, 0xF3B9CAC2, 0xA7179E84, 0xBCE6FAAD,
        0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
    },
    { /* 2^512 mod n */
        0xbe79eea2, 0x83244c95, 0x49bd6fa6, 0x4699799c,
        0x2b6bec59, 0x2845b239, 0xf3d95620, 0x66e12d94,
    },
    0xee00bc4f, /* -n^-1 mod 2^32 */
};
501
502 /*
503 * Modular addition
504 *
505 * in: x, y in [0, m)
506 * mod must point to a valid m256_mod structure
507 * out: z = (x + y) mod m, in [0, m)
508 *
509 * Note: as a memory area, z must be either equal to x or y, or not overlap.
510 */
m256_add(uint32_t z[8],const uint32_t x[8],const uint32_t y[8],const m256_mod * mod)511 static void m256_add(uint32_t z[8],
512 const uint32_t x[8], const uint32_t y[8],
513 const m256_mod *mod)
514 {
515 uint32_t r[8];
516 uint32_t carry_add = u256_add(z, x, y);
517 uint32_t carry_sub = u256_sub(r, z, mod->m);
518 /* Need to subract m if:
519 * x+y >= 2^256 > m (that is, carry_add == 1)
520 * OR z >= m (that is, carry_sub == 0) */
521 uint32_t use_sub = carry_add | (1 - carry_sub);
522 u256_cmov(z, r, use_sub);
523 }
524
525 /*
526 * Modular addition mod p
527 *
528 * in: x, y in [0, p)
529 * out: z = (x + y) mod p, in [0, p)
530 *
531 * Note: as a memory area, z must be either equal to x or y, or not overlap.
532 */
static void m256_add_p(uint32_t z[8],
                       const uint32_t x[8], const uint32_t y[8])
{
    /* thin convenience wrapper fixing the modulus to the curve's p */
    m256_add(z, x, y, &p256_p);
}
538
539 /*
540 * Modular subtraction
541 *
542 * in: x, y in [0, m)
543 * mod must point to a valid m256_mod structure
544 * out: z = (x - y) mod m, in [0, m)
545 *
546 * Note: as a memory area, z must be either equal to x or y, or not overlap.
547 */
m256_sub(uint32_t z[8],const uint32_t x[8],const uint32_t y[8],const m256_mod * mod)548 static void m256_sub(uint32_t z[8],
549 const uint32_t x[8], const uint32_t y[8],
550 const m256_mod *mod)
551 {
552 uint32_t r[8];
553 uint32_t carry = u256_sub(z, x, y);
554 (void) u256_add(r, z, mod->m);
555 /* Need to add m if and only if x < y, that is carry == 1.
556 * In that case z is in [2^256 - m + 1, 2^256 - 1], so the
557 * addition will have a carry as well, which cancels out. */
558 u256_cmov(z, r, carry);
559 }
560
561 /*
562 * Modular subtraction mod p
563 *
564 * in: x, y in [0, p)
 * out: z = (x - y) mod p, in [0, p)
566 *
567 * Note: as a memory area, z must be either equal to x or y, or not overlap.
568 */
m256_sub_p(uint32_t z[8],const uint32_t x[8],const uint32_t y[8])569 static void m256_sub_p(uint32_t z[8],
570 const uint32_t x[8], const uint32_t y[8])
571 {
572 m256_sub(z, x, y, &p256_p);
573 }
574
575 /*
576 * Montgomery modular multiplication
577 *
578 * in: x, y in [0, m)
579 * mod must point to a valid m256_mod structure
580 * out: z = (x * y) / 2^256 mod m, in [0, m)
581 *
582 * Note: as a memory area, z may overlap with x or y.
583 */
static void m256_mul(uint32_t z[8],
                     const uint32_t x[8], const uint32_t y[8],
                     const m256_mod *mod)
{
    /*
     * Algorithm 14.36 in Handbook of Applied Cryptography with:
     * b = 2^32, n = 8, R = 2^256
     */
    uint32_t m_prime = mod->ni;
    uint32_t a[9]; /* accumulator: one extra limb for intermediate carries */

    for (unsigned i = 0; i < 9; i++) {
        a[i] = 0;
    }

    for (unsigned i = 0; i < 8; i++) {
        /* the "mod 2^32" is implicit from the type */
        uint32_t u = (a[0] + x[i] * y[0]) * m_prime;

        /* a = (a + x[i] * y + u * m) div b
         * (adding u * m makes the low limb zero, so the shift is exact) */
        uint32_t c = u288_muladd(a, x[i], y);
        c += u288_muladd(a, u, mod->m);
        u288_rshift32(a, c);
    }

    /* a = a > m ? a - m : a */
    uint32_t carry_add = a[8]; // 0 or 1 since a < 2m, see HAC Note 14.37
    uint32_t carry_sub = u256_sub(z, a, mod->m);
    uint32_t use_sub = carry_add | (1 - carry_sub); // see m256_add()
    /* z already holds a - m; restore a in constant time when no reduction
     * is needed */
    u256_cmov(z, a, 1 - use_sub);
}
615
616 /*
617 * Montgomery modular multiplication modulo p.
618 *
619 * in: x, y in [0, p)
620 * out: z = (x * y) / 2^256 mod p, in [0, p)
621 *
622 * Note: as a memory area, z may overlap with x or y.
623 */
static void m256_mul_p(uint32_t z[8],
                       const uint32_t x[8], const uint32_t y[8])
{
    /* thin convenience wrapper fixing the modulus to the curve's p */
    m256_mul(z, x, y, &p256_p);
}
629
630 /*
631 * In-place conversion to Montgomery form
632 *
633 * in: z in [0, m)
634 * mod must point to a valid m256_mod structure
635 * out: z_out = z_in * 2^256 mod m, in [0, m)
636 */
static void m256_prep(uint32_t z[8], const m256_mod *mod)
{
    /* Montgomery-multiplying by R2 = 2^512 mod m yields
     * z * 2^512 / 2^256 = z * 2^256 mod m, i.e. the Montgomery image */
    m256_mul(z, z, mod->R2, mod);
}
641
642 /*
643 * In-place conversion from Montgomery form
644 *
645 * in: z in [0, m)
646 * mod must point to a valid m256_mod structure
647 * out: z_out = z_in / 2^256 mod m, in [0, m)
648 * That is, z_in was z_actual * 2^256 mod m, and z_out is z_actual
649 */
static void m256_done(uint32_t z[8], const m256_mod *mod)
{
    uint32_t one[8];
    u256_set32(one, 1);
    /* Montgomery-multiplying by plain 1 divides by 2^256 mod m,
     * undoing m256_prep() */
    m256_mul(z, z, one, mod);
}
656
657 /*
658 * Set to 32-bit value
659 *
660 * in: x in [0, 2^32)
661 * mod must point to a valid m256_mod structure
662 * out: z = x * 2^256 mod m, in [0, m)
663 * That is, z is set to the image of x in the Montgomery domain.
664 */
static void m256_set32(uint32_t z[8], uint32_t x, const m256_mod *mod)
{
    /* widen x to 256 bits, then map it into the Montgomery domain */
    u256_set32(z, x);
    m256_prep(z, mod);
}
670
671 /*
672 * Modular inversion in Montgomery form
673 *
674 * in: x in [0, m)
675 * mod must point to a valid m256_mod structure
676 * such that mod->m % 2^32 >= 2, assumed to be public.
677 * out: z = x^-1 * 2^512 mod m if x != 0,
678 * z = 0 if x == 0
679 * That is, if x = x_actual * 2^256 mod m, then
680 * z = x_actual^-1 * 2^256 mod m
681 *
682 * Note: as a memory area, z may overlap with x.
683 */
static void m256_inv(uint32_t z[8], const uint32_t x[8],
                     const m256_mod *mod)
{
    /*
     * Use Fermat's little theorem to compute x^-1 as x^(m-2).
     *
     * Take advantage of the fact that both p's and n's least significant
     * limb is at least 2 to perform the subtraction on the fly (no borrow
     * can propagate past the first limb).
     *
     * Use plain right-to-left binary exponentiation;
     * branches are OK as the exponent is not a secret.
     */
    uint32_t bitval[8];
    u256_cmov(bitval, x, 1); /* copy x before writing to z (they may alias) */

    m256_set32(z, 1, mod); /* z = 1 in the Montgomery domain */

    unsigned i = 0;
    uint32_t limb = mod->m[i] - 2; /* limb 0 of the exponent m - 2 */
    while (1) {
        for (unsigned j = 0; j < 32; j++) {
            /* multiply in the current power of x when the exponent bit
             * is set, then square for the next bit */
            if ((limb & 1) != 0) {
                m256_mul(z, z, bitval, mod);
            }
            m256_mul(bitval, bitval, bitval, mod);
            limb >>= 1;
        }

        if (i == 7)
            break;

        i++;
        limb = mod->m[i]; /* higher limbs of m - 2 are just limbs of m */
    }
}
719
720 /*
721 * Import modular integer from bytes to Montgomery domain
722 *
 * in: p = p0, ..., p31
724 * mod must point to a valid m256_mod structure
725 * out: z = (p0 * 2^248 + ... + p31) * 2^256 mod m, in [0, m)
726 * return 0 if the number was already in [0, m), or -1.
 *      z may be incorrect and must be discarded when -1 is returned.
728 */
m256_from_bytes(uint32_t z[8],const uint8_t p[32],const m256_mod * mod)729 static int m256_from_bytes(uint32_t z[8],
730 const uint8_t p[32], const m256_mod *mod)
731 {
732 u256_from_bytes(z, p);
733
734 uint32_t t[8];
735 uint32_t lt_m = u256_sub(t, z, mod->m);
736 if (lt_m != 1)
737 return -1;
738
739 m256_prep(z, mod);
740 return 0;
741 }
742
743 /*
744 * Export modular integer from Montgomery domain to bytes
745 *
746 * in: z in [0, 2^256)
747 * mod must point to a valid m256_mod structure
748 * out: p = p0, ..., p31 such that
749 * z = (p0 * 2^248 + ... + p31) * 2^256 mod m
750 */
static void m256_to_bytes(uint8_t p[32],
                          const uint32_t z[8], const m256_mod *mod)
{
    /* work on a copy so the caller's Montgomery-form value stays intact */
    uint32_t plain[8];
    u256_cmov(plain, z, 1);
    m256_done(plain, mod);
    u256_to_bytes(p, plain);
}
760
761 /**********************************************************************
762 *
763 * Operations on curve points
764 *
765 * Points are represented in two coordinates system:
766 * - affine (x, y) - extended to represent 0 (see below)
767 * - jacobian (x:y:z)
768 * In either case, coordinates are integers modulo p256_p and
769 * are always represented in the Montgomery domain.
770 *
771 * For background on jacobian coordinates, see for example [GECC] 3.2.2:
772 * - conversions go (x, y) -> (x:y:1) and (x:y:z) -> (x/z^2, y/z^3)
773 * - the curve equation becomes y^2 = x^3 - 3 x z^4 + b z^6
774 * - 0 (aka the origin aka point at infinity) is (x:y:0) with y^2 = x^3.
775 * - point negation goes -(x:y:z) = (x:-y:z)
776 *
777 * Normally 0 (the point at infinity) can't be represented in affine
778 * coordinates. However we extend affine coordinates with the convention that
779 * (0, 0) (which is normally not a point on the curve) is interpreted as 0.
780 *
781 * References:
782 * - [GECC]: Guide to Elliptic Curve Cryptography; Hankerson, Menezes,
783 * Vanstone; Springer, 2004.
784 * - [CMO98]: Efficient Elliptic Curve Exponentiation Using Mixed Coordinates;
785 * Cohen, Miyaji, Ono; Springer, ASIACRYPT 1998.
786 * https://link.springer.com/content/pdf/10.1007/3-540-49649-1_6.pdf
787 * - [RCB15]: Complete addition formulas for prime order elliptic curves;
788 * Renes, Costello, Batina; IACR e-print 2015-1060.
789 * https://eprint.iacr.org/2015/1060.pdf
790 *
791 **********************************************************************/
792
793 /*
794 * The curve's b parameter in the Short Weierstrass equation
795 * y^2 = x^3 - 3*x + b
796 * Compared to the standard, this is converted to the Montgomery domain.
797 */
/* b as defined for secp256r1, pre-converted to the Montgomery domain */
static const uint32_t p256_b[8] = { /* b * 2^256 mod p */
    0x29c4bddf, 0xd89cdf62, 0x78843090, 0xacf005cd,
    0xf7212ed6, 0xe5a220ab, 0x04874834, 0xdc30061d,
};
802
803 /*
804 * The curve's conventional base point G.
805 * Compared to the standard, coordinates converted to the Montgomery domain.
806 */
/* affine coordinates of G, pre-converted to the Montgomery domain */
static const uint32_t p256_gx[8] = { /* G_x * 2^256 mod p */
    0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc,
    0x77622510, 0x79fb732b, 0xa53755c6, 0x18905f76,
};
static const uint32_t p256_gy[8] = { /* G_y * 2^256 mod p */
    0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4,
    0xdd21f325, 0xd2e88688, 0x25885d85, 0x8571ff18,
};
815
816 /*
817 * Point-on-curve check - do the coordinates satisfy the curve's equation?
818 *
819 * in: x, y in [0, p) (Montgomery domain)
820 * out: 0 if the point lies on the curve and is not 0,
821 * unspecified non-zero otherwise
822 */
point_check(const uint32_t x[8],const uint32_t y[8])823 static uint32_t point_check(const uint32_t x[8], const uint32_t y[8])
824 {
825 uint32_t lhs[8], rhs[8];
826
827 /* lhs = y^2 */
828 m256_mul_p(lhs, y, y);
829
830 /* rhs = x^3 - 3x + b */
831 m256_mul_p(rhs, x, x); /* x^2 */
832 m256_mul_p(rhs, rhs, x); /* x^3 */
833 for (unsigned i = 0; i < 3; i++)
834 m256_sub_p(rhs, rhs, x); /* x^3 - 3x */
835 m256_add_p(rhs, rhs, p256_b); /* x^3 - 3x + b */
836
837 return u256_diff(lhs, rhs);
838 }
839
840 /*
841 * In-place jacobian to affine coordinate conversion
842 *
 * in: (x:y:z) must be on the curve (coordinates in Montgomery domain)
844 * out: x_out = x_in / z_in^2 (Montgomery domain)
845 * y_out = y_in / z_in^3 (Montgomery domain)
846 * z_out unspecified, must be disregarded
847 *
848 * Note: if z is 0 (that is, the input point is 0), x_out = y_out = 0.
849 */
point_to_affine(uint32_t x[8],uint32_t y[8],uint32_t z[8])850 static void point_to_affine(uint32_t x[8], uint32_t y[8], uint32_t z[8])
851 {
852 uint32_t t[8];
853
854 m256_inv(z, z, &p256_p); /* z = z^-1 */
855
856 m256_mul_p(t, z, z); /* t = z^-2 */
857 m256_mul_p(x, x, t); /* x = x * z^-2 */
858
859 m256_mul_p(t, t, z); /* t = z^-3 */
860 m256_mul_p(y, y, t); /* y = y * z^-3 */
861 }
862
863 /*
864 * In-place point doubling in jacobian coordinates (Montgomery domain)
865 *
866 * in: P_in = (x:y:z), must be on the curve
867 * out: (x:y:z) = P_out = 2 * P_in
868 */
static void point_double(uint32_t x[8], uint32_t y[8], uint32_t z[8])
{
    /*
     * This is formula 6 from [CMO98], cited as complete in [RCB15] (table 1).
     * Notations as in the paper, except u added and t omitted (it's x3).
     * The statement order matters: each temporary is consumed before being
     * overwritten, and x/y/z are updated only once their old values are no
     * longer needed.
     */
    uint32_t m[8], s[8], u[8];

    /* m = 3 * x^2 + a * z^4 = 3 * (x + z^2) * (x - z^2)
     * (valid because the curve's a is -3) */
    m256_mul_p(s, z, z);
    m256_add_p(m, x, s);
    m256_sub_p(u, x, s);
    m256_mul_p(s, m, u);
    m256_add_p(m, s, s);
    m256_add_p(m, m, s);

    /* s = 4 * x * y^2 */
    m256_mul_p(u, y, y);
    m256_add_p(u, u, u); /* u = 2 * y^2 (used below) */
    m256_mul_p(s, x, u);
    m256_add_p(s, s, s);

    /* u = 8 * y^4 (not named in the paper, first term of y3) */
    m256_mul_p(u, u, u);
    m256_add_p(u, u, u);

    /* x3 = t = m^2 - 2 * s */
    m256_mul_p(x, m, m);
    m256_sub_p(x, x, s);
    m256_sub_p(x, x, s);

    /* z3 = 2 * y * z (uses the old y, so computed before y is updated) */
    m256_mul_p(z, y, z);
    m256_add_p(z, z, z);

    /* y3 = -u + m * (s - t) */
    m256_sub_p(y, s, x);
    m256_mul_p(y, y, m);
    m256_sub_p(y, y, u);
}
909
910 /*
911 * In-place point addition in jacobian-affine coordinates (Montgomery domain)
912 *
913 * in: P_in = (x1:y1:z1), must be on the curve and not 0
914 * Q = (x2, y2), must be on the curve and not P_in or -P_in or 0
915 * out: P_out = (x1:y1:z1) = P_in + Q
916 */
static void point_add(uint32_t x1[8], uint32_t y1[8], uint32_t z1[8],
                      const uint32_t x2[8], const uint32_t y2[8])
{
    /*
     * This is formula 5 from [CMO98], with z2 == 1 substituted. We use
     * intermediates with neutral names, and names from the paper in comments.
     * The statement order is significant: x1/y1/z1 are overwritten only
     * once their previous values are no longer needed.
     */
    uint32_t t1[8], t2[8], t3[8];

    /* u1 = x1 and s1 = y1 (no computations, since z2 == 1) */

    /* t1 = u2 = x2 z1^2 */
    m256_mul_p(t1, z1, z1);
    m256_mul_p(t2, t1, z1); /* t2 = z1^3, reused for s2 below */
    m256_mul_p(t1, t1, x2);

    /* t2 = s2 = y2 z1^3 */
    m256_mul_p(t2, t2, y2);

    /* t1 = h = u2 - u1 */
    m256_sub_p(t1, t1, x1); /* t1 = x2 * z1^2 - x1 */

    /* t2 = r = s2 - s1 */
    m256_sub_p(t2, t2, y1);

    /* z3 = z1 * h */
    m256_mul_p(z1, z1, t1);

    /* t1 = h^3 (t3 transiently holds h^2) */
    m256_mul_p(t3, t1, t1);
    m256_mul_p(t1, t3, t1);

    /* t3 = x1 * h^2 */
    m256_mul_p(t3, t3, x1);

    /* x3 = r^2 - 2 * x1 * h^2 - h^3 */
    m256_mul_p(x1, t2, t2);
    m256_sub_p(x1, x1, t3);
    m256_sub_p(x1, x1, t3);
    m256_sub_p(x1, x1, t1);

    /* y3 = r * (x1 * h^2 - x3) - y1 h^3 */
    m256_sub_p(t3, t3, x1);
    m256_mul_p(t3, t3, t2);
    m256_mul_p(t1, t1, y1);
    m256_sub_p(y1, t3, t1);
}
964
965 /*
966 * Point addition or doubling (affine to jacobian, Montgomery domain)
967 *
968 * in: P = (x1, y1) - must be on the curve and not 0
969 * Q = (x2, y2) - must be on the curve and not 0
970 * out: (x3, y3) = R = P + Q
971 *
972 * Note: unlike point_add(), this function works if P = +- Q;
973 * however it leaks information on its input through timing,
974 * branches taken and memory access patterns (if observable).
975 */
point_add_or_double_leaky(uint32_t x3[8],uint32_t y3[8],const uint32_t x1[8],const uint32_t y1[8],const uint32_t x2[8],const uint32_t y2[8])976 static void point_add_or_double_leaky(
977 uint32_t x3[8], uint32_t y3[8],
978 const uint32_t x1[8], const uint32_t y1[8],
979 const uint32_t x2[8], const uint32_t y2[8])
980 {
981
982 uint32_t z3[8];
983 u256_cmov(x3, x1, 1);
984 u256_cmov(y3, y1, 1);
985 m256_set32(z3, 1, &p256_p);
986
987 if (u256_diff(x1, x2) != 0) {
988 // P != +- Q -> generic addition
989 point_add(x3, y3, z3, x2, y2);
990 point_to_affine(x3, y3, z3);
991 }
992 else if (u256_diff(y1, y2) == 0) {
993 // P == Q -> double
994 point_double(x3, y3, z3);
995 point_to_affine(x3, y3, z3);
996 } else {
997 // P == -Q -> zero
998 m256_set32(x3, 0, &p256_p);
999 m256_set32(y3, 0, &p256_p);
1000 }
1001 }
1002
1003 /*
1004 * Import curve point from bytes
1005 *
1006 * in: p = (x, y) concatenated, fixed-width 256-bit big-endian integers
 * out: x, y in Montgomery domain
1008 * return 0 if x and y are both in [0, p)
1009 * and (x, y) is on the curve and not 0
1010 * unspecified non-zero otherwise.
1011 * x and y are unspecified and must be discarded if returning non-zero.
1012 */
point_from_bytes(uint32_t x[8],uint32_t y[8],const uint8_t p[64])1013 static int point_from_bytes(uint32_t x[8], uint32_t y[8], const uint8_t p[64])
1014 {
1015 int ret;
1016
1017 ret = m256_from_bytes(x, p, &p256_p);
1018 if (ret != 0)
1019 return ret;
1020
1021 ret = m256_from_bytes(y, p + 32, &p256_p);
1022 if (ret != 0)
1023 return ret;
1024
1025 return (int) point_check(x, y);
1026 }
1027
1028 /*
1029 * Export curve point to bytes
1030 *
1031 * in: x, y affine coordinates of a point (Montgomery domain)
1032 * must be on the curve and not 0
1033 * out: p = (x, y) concatenated, fixed-width 256-bit big-endian integers
1034 */
point_to_bytes(uint8_t p[64],const uint32_t x[8],const uint32_t y[8])1035 static void point_to_bytes(uint8_t p[64],
1036 const uint32_t x[8], const uint32_t y[8])
1037 {
1038 m256_to_bytes(p, x, &p256_p);
1039 m256_to_bytes(p + 32, y, &p256_p);
1040 }
1041
1042 /**********************************************************************
1043 *
1044 * Scalar multiplication and other scalar-related operations
1045 *
1046 **********************************************************************/
1047
1048 /*
1049 * Scalar multiplication
1050 *
1051 * in: P = (px, py), affine (Montgomery), must be on the curve and not 0
1052 * s in [1, n-1]
1053 * out: R = s * P = (rx, ry), affine coordinates (Montgomery).
1054 *
1055 * Note: as memory areas, none of the parameters may overlap.
1056 */
static void scalar_mult(uint32_t rx[8], uint32_t ry[8],
                        const uint32_t px[8], const uint32_t py[8],
                        const uint32_t s[8])
{
    /*
     * We use a signed binary ladder, see for example slides 10-14 of
     * http://ecc2015.math.u-bordeaux1.fr/documents/hamburg.pdf but with
     * implicit recoding, and a different loop initialisation to avoid feeding
     * 0 to our addition formulas, as they don't support it.
     *
     * The scalar s is secret: no branch or memory access below may depend
     * on it - all selections are done with u256_cmov().
     */
    uint32_t s_odd[8], py_neg[8], py_use[8], rz[8];

    /*
     * Make s odd by replacing it with n - s if necessary.
     *
     * If s was odd, we'll have s_odd = s, and define P' = P.
     * Otherwise, we'll have s_odd = n - s and define P' = -P.
     *
     * Either way, we can compute s * P as s_odd * P'.
     */
    u256_sub(s_odd, p256_n.m, s); /* no carry, result still in [1, n-1] */
    uint32_t negate = ~s[0] & 1;  /* 1 iff s is even (derived without branching) */
    /* constant-time select: keep s itself when it was already odd */
    u256_cmov(s_odd, s, 1 - negate);

    /* Compute py_neg = - py mod p (that's the y coordinate of -P) */
    u256_set32(py_use, 0);
    m256_sub_p(py_neg, py_use, py);

    /* Initialize R = P' = (x:(-1)^negate * y:1) */
    u256_cmov(rx, px, 1);
    u256_cmov(ry, py, 1);
    m256_set32(rz, 1, &p256_p);
    u256_cmov(ry, py_neg, negate);

    /*
     * For any odd number s_odd = b255 ... b1 1, we have
     *      s_odd = 2^255 + 2^254 sbit(b255) + ... + 2 sbit(b2) + sbit(b1)
     * writing
     *      sbit(b) = 2 * b - 1 = b ? 1 : -1
     *
     * Use that to compute s_odd * P' by repeating R = 2 * R +- P':
     *      s_odd * P' = 2 * ( ... (2 * P' + sbit(b255) P') ... ) + sbit(b1) P'
     *
     * The loop invariant is that when beginning an iteration we have
     *      R = s_i P'
     * with
     *      s_i = 2^(255-i) + 2^(254-i) sbit(b_255) + ...
     * where the sum has 256 - i terms.
     *
     * When updating R we need to make sure the input to point_add() is
     * neither 0 nor +-P'. Since that input is 2 s_i P', it is sufficient to
     * see that 1 < 2 s_i < n-1. The lower bound is obvious since s_i is a
     * positive integer, and for the upper bound we distinguish three cases.
     *
     * If i > 1, then s_i < 2^254, so 2 s_i < 2^255 < n-1.
     * Otherwise, i == 1 and we have 2 s_i = s_odd - sbit(b1).
     *      If s_odd <= n-4, then 2 s_1 <= n-3.
     *      Otherwise, s_odd = n-2, and for this curve's value of n,
     *      we have b1 == 1, so sbit(b1) = 1 and 2 s_1 <= n-3.
     */
    for (unsigned i = 255; i > 0; i--) {
        /* bit is secret: extracted arithmetically, consumed only by cmovs */
        uint32_t bit = (s_odd[i / 32] >> i % 32) & 1;

        /* set (px, py_use) = sbit(bit) P' = sbit(bit) * (-1)^negate P */
        u256_cmov(py_use, py, bit ^ negate);
        u256_cmov(py_use, py_neg, (1 - bit) ^ negate);

        /* Update R = 2 * R +- P' */
        point_double(rx, ry, rz);
        point_add(rx, ry, rz, px, py_use);
    }

    /* convert the result back to affine coordinates */
    point_to_affine(rx, ry, rz);
}
1131
1132 /*
1133 * Scalar import from big-endian bytes
1134 *
1135 * in: p = p0, ..., p31
1136 * out: s = p0 * 2^248 + p1 * 2^240 + ... + p30 * 2^8 + p31
1137 * return 0 if s in [1, n-1],
1138 * -1 otherwise.
1139 */
scalar_from_bytes(uint32_t s[8],const uint8_t p[32])1140 static int scalar_from_bytes(uint32_t s[8], const uint8_t p[32])
1141 {
1142 u256_from_bytes(s, p);
1143
1144 uint32_t r[8];
1145 uint32_t lt_n = u256_sub(r, s, p256_n.m);
1146
1147 u256_set32(r, 1);
1148 uint32_t lt_1 = u256_sub(r, s, r);
1149
1150 if (lt_n && !lt_1)
1151 return 0;
1152
1153 return -1;
1154 }
1155
1156 /* Using RNG functions from Mbed TLS as p256-m does not come with a
1157 * cryptographically secure RNG function.
1158 */
p256_generate_random(uint8_t * output,unsigned output_size)1159 int p256_generate_random(uint8_t *output, unsigned output_size)
1160 {
1161 int ret;
1162 ret = psa_generate_random(output, output_size);
1163
1164 if (ret != 0){
1165 return P256_RANDOM_FAILED;
1166 }
1167 return P256_SUCCESS;
1168 }
1169
1170 /*
1171 * Scalar generation, with public key
1172 *
1173 * out: sbytes the big-endian bytes representation of the scalar
1174 * s its u256 representation
1175 * x, y the affine coordinates of s * G (Montgomery domain)
1176 * return 0 if OK, -1 on failure
1177 * sbytes, s, x, y must be discarded when returning non-zero.
1178 */
static int scalar_gen_with_pub(uint8_t sbytes[32], uint32_t s[8],
                               uint32_t x[8], uint32_t y[8])
{
    /* generate a random valid scalar by rejection sampling: draw 32 random
     * bytes, keep them only if they decode to a scalar in [1, n-1] */
    int ret;
    unsigned nb_tried = 0;
    do {
        /* rejection is overwhelmingly unlikely with a healthy RNG, so four
         * failures in a row almost surely mean the RNG is broken: give up */
        if (nb_tried++ >= 4)
            return -1;

        ret = p256_generate_random(sbytes, 32);
        /* mark the candidate scalar as secret for constant-time testing */
        CT_POISON(sbytes, 32);
        if (ret != 0)
            return -1;

        ret = scalar_from_bytes(s, sbytes);
        /* whether the candidate was valid is public (we branch on it) */
        CT_UNPOISON(&ret, sizeof ret);
    }
    while (ret != 0);

    /* compute and output the associated public key */
    scalar_mult(x, y, p256_gx, p256_gy, s);

    /* the associated public key is not a secret */
    CT_UNPOISON(x, 32);
    CT_UNPOISON(y, 32);

    return 0;
}
1208
1209 /*
1210 * ECDH/ECDSA generate pair
1211 */
p256_gen_keypair(uint8_t priv[32],uint8_t pub[64])1212 int p256_gen_keypair(uint8_t priv[32], uint8_t pub[64])
1213 {
1214 uint32_t s[8], x[8], y[8];
1215 int ret = scalar_gen_with_pub(priv, s, x, y);
1216 zeroize(s, sizeof s);
1217 if (ret != 0)
1218 return P256_RANDOM_FAILED;
1219
1220 point_to_bytes(pub, x, y);
1221 return 0;
1222 }
1223
1224 /**********************************************************************
1225 *
1226 * ECDH
1227 *
1228 **********************************************************************/
1229
1230 /*
1231 * ECDH compute shared secret
1232 */
int p256_ecdh_shared_secret(uint8_t secret[32],
                            const uint8_t priv[32], const uint8_t peer[64])
{
    /* the private key is secret: track it for constant-time testing */
    CT_POISON(priv, 32);

    uint32_t s[8], px[8], py[8], x[8], y[8];
    int ret;

    ret = scalar_from_bytes(s, priv);
    /* validity of the private key is public (reflected in the return code) */
    CT_UNPOISON(&ret, sizeof ret);
    if (ret != 0) {
        ret = P256_INVALID_PRIVKEY;
        goto cleanup;
    }

    /* the peer's public key is public data; reject it if it's not a valid
     * non-zero point on the curve */
    ret = point_from_bytes(px, py, peer);
    if (ret != 0) {
        ret = P256_INVALID_PUBKEY;
        goto cleanup;
    }

    /* the shared secret is the x coordinate of priv * peer,
     * computed in constant time by scalar_mult() */
    scalar_mult(x, y, px, py, s);

    m256_to_bytes(secret, x, &p256_p);
    /* the shared secret is this function's output: from the point of view
     * of constant-time testing it is no longer tracked */
    CT_UNPOISON(secret, 32);

    /* NOTE(review): the raw shared-point coordinates x/y are not zeroized
     * here - presumably acceptable as transient stack locals; confirm
     * against the project's zeroization policy. */
cleanup:
    /* wipe the scalar on every path, including the error paths */
    zeroize(s, sizeof s);
    return ret;
}
1263
1264 /**********************************************************************
1265 *
1266 * ECDSA
1267 *
1268 * Reference:
1269 * [SEC1] SEC 1: Elliptic Curve Cryptography, Certicom research, 2009.
1270 * http://www.secg.org/sec1-v2.pdf
1271 **********************************************************************/
1272
1273 /*
1274 * Reduction mod n of a small number
1275 *
1276 * in: x in [0, 2^256)
1277 * out: x_out = x_in mod n in [0, n)
1278 */
ecdsa_m256_mod_n(uint32_t x[8])1279 static void ecdsa_m256_mod_n(uint32_t x[8])
1280 {
1281 uint32_t t[8];
1282 uint32_t c = u256_sub(t, x, p256_n.m);
1283 u256_cmov(x, t, 1 - c);
1284 }
1285
1286 /*
1287 * Import integer mod n (Montgomery domain) from hash
1288 *
1289 * in: h = h0, ..., h_hlen
1290 * hlen the length of h in bytes
 * out: z = (h0 * 2^(8*(l-1)) + ... + h_(l-1)) * 2^256 mod n
 * with l = min(32, hlen)
1293 *
 * Note: in [SEC1] this is step 5 of 4.1.3 (sign) or step 3 of 4.1.4 (verify),
 * with obvious simplifications since n's bit-length is a multiple of 8.
1296 */
ecdsa_m256_from_hash(uint32_t z[8],const uint8_t * h,size_t hlen)1297 static void ecdsa_m256_from_hash(uint32_t z[8],
1298 const uint8_t *h, size_t hlen)
1299 {
1300 /* convert from h (big-endian) */
1301 /* hlen is public data so it's OK to branch on it */
1302 if (hlen < 32) {
1303 uint8_t p[32];
1304 for (unsigned i = 0; i < 32; i++)
1305 p[i] = 0;
1306 for (unsigned i = 0; i < hlen; i++)
1307 p[32 - hlen + i] = h[i];
1308 u256_from_bytes(z, p);
1309 } else {
1310 u256_from_bytes(z, h);
1311 }
1312
1313 /* ensure the result is in [0, n) */
1314 ecdsa_m256_mod_n(z);
1315
1316 /* map to Montgomery domain */
1317 m256_prep(z, &p256_n);
1318 }
1319
1320 /*
1321 * ECDSA sign
1322 */
int p256_ecdsa_sign(uint8_t sig[64], const uint8_t priv[32],
                    const uint8_t *hash, size_t hlen)
{
    /* the private key is secret: track it for constant-time testing */
    CT_POISON(priv, 32);

    /*
     * Steps and notations from [SEC1] 4.1.3
     *
     * Instead of retrying on r == 0 or s == 0, just abort,
     * as those events have negligible probability.
     */
    int ret;

    /* Temporary buffers - the first two are mostly stable, so have names */
    uint32_t xr[8], k[8], t3[8], t4[8];

    /* 1. Set ephemeral keypair */
    uint8_t *kb = (uint8_t *) t4;
    /* kb will be erased by re-using t4 for dU - if we exit before that, we
     * haven't read the private key yet so kb isn't sensitive yet */
    ret = scalar_gen_with_pub(kb, k, xr, t3); /* xr = x_coord(k * G) */
    if (ret != 0)
        return P256_RANDOM_FAILED;
    m256_prep(k, &p256_n); /* k to Montgomery domain mod n */

    /* 2. Convert xr to an integer */
    m256_done(xr, &p256_p);

    /* 3. Reduce xr mod n (extra: output it while at it) */
    ecdsa_m256_mod_n(xr); /* xr = int(xr) mod n */

    /* xr is public data so it's OK to use a branch */
    if (u256_diff0(xr) == 0)
        return P256_RANDOM_FAILED;

    u256_to_bytes(sig, xr); /* r goes in the first half of the signature */

    m256_prep(xr, &p256_n); /* xr (= r) to Montgomery domain mod n */

    /* 4. Skipped - we take the hash as an input, not the message */

    /* 5. Derive an integer from the hash */
    ecdsa_m256_from_hash(t3, hash, hlen); /* t3 = e */

    /* 6. Compute s = k^-1 * (e + r * dU) */

    /* Note: dU will be erased by re-using t4 for the value of s (public) */
    ret = scalar_from_bytes(t4, priv); /* t4 = dU (integer domain) */
    CT_UNPOISON(&ret, sizeof ret); /* Result of input validation */
    if (ret != 0)
        return P256_INVALID_PRIVKEY;
    m256_prep(t4, &p256_n); /* t4 = dU (Montgomery domain) */

    m256_inv(k, k, &p256_n); /* k^-1 */
    m256_mul(t4, xr, t4, &p256_n); /* t4 = r * dU */
    m256_add(t4, t3, t4, &p256_n); /* t4 = e + r * dU */
    m256_mul(t4, k, t4, &p256_n); /* t4 = s = k^-1 * (e + r * dU) */
    zeroize(k, sizeof k); /* k (and k^-1) must never leak */

    /* 7. Output s (r already output at step 3) */
    CT_UNPOISON(t4, 32); /* s is part of the public signature */
    if (u256_diff0(t4) == 0) {
        /* undo early output of r: t4 is all-zero here (that's what the
         * test above established), so this wipes the r half of sig */
        u256_to_bytes(sig, t4);
        return P256_RANDOM_FAILED;
    }
    m256_to_bytes(sig + 32, t4, &p256_n);

    return P256_SUCCESS;
}
1393
1394 /*
1395 * ECDSA verify
1396 */
int p256_ecdsa_verify(const uint8_t sig[64], const uint8_t pub[64],
                      const uint8_t *hash, size_t hlen)
{
    /*
     * Steps and notations from [SEC1] 4.1.4
     *
     * Note: we're using public data only, so branches are OK
     */
    int ret;

    /* 1. Validate range of r and s : [1, n-1] */
    uint32_t r[8], s[8];
    ret = scalar_from_bytes(r, sig);
    if (ret != 0)
        return P256_INVALID_SIGNATURE;
    ret = scalar_from_bytes(s, sig + 32);
    if (ret != 0)
        return P256_INVALID_SIGNATURE;

    /* 2. Skipped - we take the hash as an input, not the message */

    /* 3. Derive an integer from the hash */
    uint32_t e[8];
    ecdsa_m256_from_hash(e, hash, hlen);

    /* 4. Compute u1 = e * s^-1 and u2 = r * s^-1 */
    uint32_t u1[8], u2[8];
    m256_prep(s, &p256_n); /* s in Montgomery domain */
    m256_inv(s, s, &p256_n); /* s = s^-1 mod n */
    m256_mul(u1, e, s, &p256_n); /* u1 = e * s^-1 mod n */
    m256_done(u1, &p256_n); /* u1 out of Montgomery domain */

    u256_cmov(u2, r, 1);
    m256_prep(u2, &p256_n); /* r in Montgomery domain */
    m256_mul(u2, u2, s, &p256_n); /* u2 = r * s^-1 mod n */
    m256_done(u2, &p256_n); /* u2 out of Montgomery domain */

    /* 5. Compute R (and re-use (u1, u2) to store its coordinates) */
    uint32_t px[8], py[8];
    ret = point_from_bytes(px, py, pub);
    if (ret != 0)
        return P256_INVALID_PUBKEY;

    /* e and s are no longer needed as scalars: reuse them for R2 */
    scalar_mult(e, s, px, py, u2); /* (e, s) = R2 = u2 * Qu */

    if (u256_diff0(u1) == 0) {
        /* u1 out of range for scalar_mult() - just skip it */
        u256_cmov(u1, e, 1);
        /* we don't care about the y coordinate */
    } else {
        scalar_mult(px, py, p256_gx, p256_gy, u1); /* (px, py) = R1 = u1 * G */

        /* (u1, u2) = R = R1 + R2 */
        point_add_or_double_leaky(u1, u2, px, py, e, s);
        /* No need to check if R == 0 here: if that's the case, it will be
         * caught when comparing rx (which will be 0) to r (which isn't). */
    }

    /* 6. Convert xR to an integer */
    m256_done(u1, &p256_p);

    /* 7. Reduce xR mod n */
    ecdsa_m256_mod_n(u1);

    /* 8. Compare xR mod n to r */
    uint32_t diff = u256_diff(u1, r);
    if (diff == 0)
        return P256_SUCCESS;

    return P256_INVALID_SIGNATURE;
}
1468
1469 /**********************************************************************
1470 *
1471 * Key management utilities
1472 *
1473 **********************************************************************/
1474
p256_validate_pubkey(const uint8_t pub[64])1475 int p256_validate_pubkey(const uint8_t pub[64])
1476 {
1477 uint32_t x[8], y[8];
1478 int ret = point_from_bytes(x, y, pub);
1479
1480 return ret == 0 ? P256_SUCCESS : P256_INVALID_PUBKEY;
1481 }
1482
p256_validate_privkey(const uint8_t priv[32])1483 int p256_validate_privkey(const uint8_t priv[32])
1484 {
1485 uint32_t s[8];
1486 int ret = scalar_from_bytes(s, priv);
1487 zeroize(s, sizeof(s));
1488
1489 return ret == 0 ? P256_SUCCESS : P256_INVALID_PRIVKEY;
1490 }
1491
p256_public_from_private(uint8_t pub[64],const uint8_t priv[32])1492 int p256_public_from_private(uint8_t pub[64], const uint8_t priv[32])
1493 {
1494 int ret;
1495 uint32_t s[8];
1496
1497 ret = scalar_from_bytes(s, priv);
1498 if (ret != 0)
1499 return P256_INVALID_PRIVKEY;
1500
1501 /* compute and ouput the associated public key */
1502 uint32_t x[8], y[8];
1503 scalar_mult(x, y, p256_gx, p256_gy, s);
1504
1505 /* the associated public key is not a secret, the scalar was */
1506 CT_UNPOISON(x, 32);
1507 CT_UNPOISON(y, 32);
1508 zeroize(s, sizeof(s));
1509
1510 point_to_bytes(pub, x, y);
1511 return P256_SUCCESS;
1512 }
1513
1514 #endif
1515