1 // The MIT License (MIT)
2 //
3 // Copyright (c) 2015-2016 the fiat-crypto authors (see the AUTHORS file).
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in all
13 // copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 // SOFTWARE.
22 
23 // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
24 // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
25 // public domain but parts have been replaced with code generated by Fiat
26 // (https://github.com/mit-plv/fiat-crypto), which is MIT licensed.
27 //
28 // The field functions are shared by Ed25519 and X25519 where possible.
29 
30 #include <assert.h>
31 #include <string.h>
32 #include <stdint.h>
33 
34 #include <mcuboot_config/mcuboot_config.h>
35 
36 #if defined(MCUBOOT_USE_MBED_TLS)
37 #include <mbedtls/platform_util.h>
38 #include <mbedtls/sha512.h>
39 #include <mbedtls/version.h>
40     #if MBEDTLS_VERSION_NUMBER >= 0x03000000
41         #include <mbedtls/compat-2.x.h>
42     #endif
43 #else
44 #include <tinycrypt/constants.h>
45 #include <tinycrypt/utils.h>
46 #include <tinycrypt/sha512.h>
47 #endif
48 
49 #include "curve25519.h"
50 // Various pre-computed constants.
51 #include "curve25519_tables.h"
52 
53 #define SHA512_DIGEST_LENGTH 64
54 
55 // Low-level intrinsic operations
56 
load_3(const uint8_t * in)57 static uint64_t load_3(const uint8_t *in) {
58   uint64_t result;
59   result = (uint64_t)in[0];
60   result |= ((uint64_t)in[1]) << 8;
61   result |= ((uint64_t)in[2]) << 16;
62   return result;
63 }
64 
load_4(const uint8_t * in)65 static uint64_t load_4(const uint8_t *in) {
66   uint64_t result;
67   result = (uint64_t)in[0];
68   result |= ((uint64_t)in[1]) << 8;
69   result |= ((uint64_t)in[2]) << 16;
70   result |= ((uint64_t)in[3]) << 24;
71   return result;
72 }
73 
74 
75 // Field operations.
76 
77 typedef uint32_t fe_limb_t;
78 #define FE_NUM_LIMBS 10
79 
80 // assert_fe asserts that |f| satisfies bounds:
81 //
82 //  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
83 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
84 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
85 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
86 //   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
87 //
88 // See comments in curve25519_32.h for which functions use these bounds for
89 // inputs or outputs.
90 #define assert_fe(f)                                                     \
91   do {                                                                   \
92     for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
93       assert(f[_assert_fe_i] <=                                          \
94              ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
95     }                                                                    \
96   } while (0)
97 
98 // assert_fe_loose asserts that |f| satisfies bounds:
99 //
100 //  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
101 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
102 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
103 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
104 //   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
105 //
106 // See comments in curve25519_32.h for which functions use these bounds for
107 // inputs or outputs.
108 #define assert_fe_loose(f)                                               \
109   do {                                                                   \
110     for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
111       assert(f[_assert_fe_i] <=                                          \
112              ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
113     }                                                                    \
114   } while (0)
115 
116 //FIXME: use Zephyr macro
117 _Static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
118                "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
119 
fe_frombytes_strict(fe * h,const uint8_t s[32])120 static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
121   // |fiat_25519_from_bytes| requires the top-most bit be clear.
122   assert((s[31] & 0x80) == 0);
123   fiat_25519_from_bytes(h->v, s);
124   assert_fe(h->v);
125 }
126 
fe_frombytes(fe * h,const uint8_t s[32])127 static void fe_frombytes(fe *h, const uint8_t s[32]) {
128   uint8_t s_copy[32];
129   memcpy(s_copy, s, 32);
130   s_copy[31] &= 0x7f;
131   fe_frombytes_strict(h, s_copy);
132 }
133 
fe_tobytes(uint8_t s[32],const fe * f)134 static void fe_tobytes(uint8_t s[32], const fe *f) {
135   assert_fe(f->v);
136   fiat_25519_to_bytes(s, f->v);
137 }
138 
139 // h = 0
fe_0(fe * h)140 static void fe_0(fe *h) {
141 #if defined(MCUBOOT_USE_MBED_TLS)
142   mbedtls_platform_zeroize(h, sizeof(fe));
143 #else
144   _set(h, 0, sizeof(fe));
145 #endif
146 }
147 
148 // h = 1
fe_1(fe * h)149 static void fe_1(fe *h) {
150 #if defined(MCUBOOT_USE_MBED_TLS)
151   mbedtls_platform_zeroize(h, sizeof(fe));
152 #else
153   _set(h, 0, sizeof(fe));
154 #endif
155   h->v[0] = 1;
156 }
157 
158 // h = f + g
159 // Can overlap h with f or g.
fe_add(fe_loose * h,const fe * f,const fe * g)160 static void fe_add(fe_loose *h, const fe *f, const fe *g) {
161   assert_fe(f->v);
162   assert_fe(g->v);
163   fiat_25519_add(h->v, f->v, g->v);
164   assert_fe_loose(h->v);
165 }
166 
167 // h = f - g
168 // Can overlap h with f or g.
fe_sub(fe_loose * h,const fe * f,const fe * g)169 static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
170   assert_fe(f->v);
171   assert_fe(g->v);
172   fiat_25519_sub(h->v, f->v, g->v);
173   assert_fe_loose(h->v);
174 }
175 
fe_carry(fe * h,const fe_loose * f)176 static void fe_carry(fe *h, const fe_loose* f) {
177   assert_fe_loose(f->v);
178   fiat_25519_carry(h->v, f->v);
179   assert_fe(h->v);
180 }
181 
fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],const fe_limb_t in1[FE_NUM_LIMBS],const fe_limb_t in2[FE_NUM_LIMBS])182 static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
183                         const fe_limb_t in1[FE_NUM_LIMBS],
184                         const fe_limb_t in2[FE_NUM_LIMBS]) {
185   assert_fe_loose(in1);
186   assert_fe_loose(in2);
187   fiat_25519_carry_mul(out, in1, in2);
188   assert_fe(out);
189 }
190 
fe_mul_ltt(fe_loose * h,const fe * f,const fe * g)191 static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
192   fe_mul_impl(h->v, f->v, g->v);
193 }
194 
fe_mul_ttt(fe * h,const fe * f,const fe * g)195 static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
196   fe_mul_impl(h->v, f->v, g->v);
197 }
198 
fe_mul_tlt(fe * h,const fe_loose * f,const fe * g)199 static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
200   fe_mul_impl(h->v, f->v, g->v);
201 }
202 
fe_mul_ttl(fe * h,const fe * f,const fe_loose * g)203 static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
204   fe_mul_impl(h->v, f->v, g->v);
205 }
206 
fe_mul_tll(fe * h,const fe_loose * f,const fe_loose * g)207 static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
208   fe_mul_impl(h->v, f->v, g->v);
209 }
210 
fe_sq_tl(fe * h,const fe_loose * f)211 static void fe_sq_tl(fe *h, const fe_loose *f) {
212   assert_fe_loose(f->v);
213   fiat_25519_carry_square(h->v, f->v);
214   assert_fe(h->v);
215 }
216 
fe_sq_tt(fe * h,const fe * f)217 static void fe_sq_tt(fe *h, const fe *f) {
218   assert_fe_loose(f->v);
219   fiat_25519_carry_square(h->v, f->v);
220   assert_fe(h->v);
221 }
222 
223 // h = -f
fe_neg(fe_loose * h,const fe * f)224 static void fe_neg(fe_loose *h, const fe *f) {
225   assert_fe(f->v);
226   fiat_25519_opp(h->v, f->v);
227   assert_fe_loose(h->v);
228 }
229 
230 // h = f
fe_copy(fe * h,const fe * f)231 static void fe_copy(fe *h, const fe *f) {
232   memmove(h, f, sizeof(fe));
233 }
234 
fe_copy_lt(fe_loose * h,const fe * f)235 static void fe_copy_lt(fe_loose *h, const fe *f) {
236   //FIXME: use Zephyr macro
237   _Static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch");
238   memmove(h, f, sizeof(fe));
239 }
240 
fe_loose_invert(fe * out,const fe_loose * z)241 static void fe_loose_invert(fe *out, const fe_loose *z) {
242   fe t0;
243   fe t1;
244   fe t2;
245   fe t3;
246   int i;
247 
248   fe_sq_tl(&t0, z);
249   fe_sq_tt(&t1, &t0);
250   for (i = 1; i < 2; ++i) {
251     fe_sq_tt(&t1, &t1);
252   }
253   fe_mul_tlt(&t1, z, &t1);
254   fe_mul_ttt(&t0, &t0, &t1);
255   fe_sq_tt(&t2, &t0);
256   fe_mul_ttt(&t1, &t1, &t2);
257   fe_sq_tt(&t2, &t1);
258   for (i = 1; i < 5; ++i) {
259     fe_sq_tt(&t2, &t2);
260   }
261   fe_mul_ttt(&t1, &t2, &t1);
262   fe_sq_tt(&t2, &t1);
263   for (i = 1; i < 10; ++i) {
264     fe_sq_tt(&t2, &t2);
265   }
266   fe_mul_ttt(&t2, &t2, &t1);
267   fe_sq_tt(&t3, &t2);
268   for (i = 1; i < 20; ++i) {
269     fe_sq_tt(&t3, &t3);
270   }
271   fe_mul_ttt(&t2, &t3, &t2);
272   fe_sq_tt(&t2, &t2);
273   for (i = 1; i < 10; ++i) {
274     fe_sq_tt(&t2, &t2);
275   }
276   fe_mul_ttt(&t1, &t2, &t1);
277   fe_sq_tt(&t2, &t1);
278   for (i = 1; i < 50; ++i) {
279     fe_sq_tt(&t2, &t2);
280   }
281   fe_mul_ttt(&t2, &t2, &t1);
282   fe_sq_tt(&t3, &t2);
283   for (i = 1; i < 100; ++i) {
284     fe_sq_tt(&t3, &t3);
285   }
286   fe_mul_ttt(&t2, &t3, &t2);
287   fe_sq_tt(&t2, &t2);
288   for (i = 1; i < 50; ++i) {
289     fe_sq_tt(&t2, &t2);
290   }
291   fe_mul_ttt(&t1, &t2, &t1);
292   fe_sq_tt(&t1, &t1);
293   for (i = 1; i < 5; ++i) {
294     fe_sq_tt(&t1, &t1);
295   }
296   fe_mul_ttt(out, &t1, &t0);
297 }
298 
fe_invert(fe * out,const fe * z)299 static void fe_invert(fe *out, const fe *z) {
300   fe_loose l;
301   fe_copy_lt(&l, z);
302   fe_loose_invert(out, &l);
303 }
304 
CRYPTO_memcmp(const void * in_a,const void * in_b,size_t len)305 static int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) {
306   const uint8_t *a = in_a;
307   const uint8_t *b = in_b;
308   uint8_t x = 0;
309 
310   for (size_t i = 0; i < len; i++) {
311     x |= a[i] ^ b[i];
312   }
313 
314   return x;
315 }
316 
317 // return 0 if f == 0
318 // return 1 if f != 0
fe_isnonzero(const fe_loose * f)319 static int fe_isnonzero(const fe_loose *f) {
320   fe tight;
321   fe_carry(&tight, f);
322   uint8_t s[32];
323   fe_tobytes(s, &tight);
324 
325   static const uint8_t zero[32] = {0};
326   return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
327 }
328 
329 // return 1 if f is in {1,3,5,...,q-2}
330 // return 0 if f is in {0,2,4,...,q-1}
fe_isnegative(const fe * f)331 static int fe_isnegative(const fe *f) {
332   uint8_t s[32];
333   fe_tobytes(s, f);
334   return s[0] & 1;
335 }
336 
fe_sq2_tt(fe * h,const fe * f)337 static void fe_sq2_tt(fe *h, const fe *f) {
338   // h = f^2
339   fe_sq_tt(h, f);
340 
341   // h = h + h
342   fe_loose tmp;
343   fe_add(&tmp, h, h);
344   fe_carry(h, &tmp);
345 }
346 
fe_pow22523(fe * out,const fe * z)347 static void fe_pow22523(fe *out, const fe *z) {
348   fe t0;
349   fe t1;
350   fe t2;
351   int i;
352 
353   fe_sq_tt(&t0, z);
354   fe_sq_tt(&t1, &t0);
355   for (i = 1; i < 2; ++i) {
356     fe_sq_tt(&t1, &t1);
357   }
358   fe_mul_ttt(&t1, z, &t1);
359   fe_mul_ttt(&t0, &t0, &t1);
360   fe_sq_tt(&t0, &t0);
361   fe_mul_ttt(&t0, &t1, &t0);
362   fe_sq_tt(&t1, &t0);
363   for (i = 1; i < 5; ++i) {
364     fe_sq_tt(&t1, &t1);
365   }
366   fe_mul_ttt(&t0, &t1, &t0);
367   fe_sq_tt(&t1, &t0);
368   for (i = 1; i < 10; ++i) {
369     fe_sq_tt(&t1, &t1);
370   }
371   fe_mul_ttt(&t1, &t1, &t0);
372   fe_sq_tt(&t2, &t1);
373   for (i = 1; i < 20; ++i) {
374     fe_sq_tt(&t2, &t2);
375   }
376   fe_mul_ttt(&t1, &t2, &t1);
377   fe_sq_tt(&t1, &t1);
378   for (i = 1; i < 10; ++i) {
379     fe_sq_tt(&t1, &t1);
380   }
381   fe_mul_ttt(&t0, &t1, &t0);
382   fe_sq_tt(&t1, &t0);
383   for (i = 1; i < 50; ++i) {
384     fe_sq_tt(&t1, &t1);
385   }
386   fe_mul_ttt(&t1, &t1, &t0);
387   fe_sq_tt(&t2, &t1);
388   for (i = 1; i < 100; ++i) {
389     fe_sq_tt(&t2, &t2);
390   }
391   fe_mul_ttt(&t1, &t2, &t1);
392   fe_sq_tt(&t1, &t1);
393   for (i = 1; i < 50; ++i) {
394     fe_sq_tt(&t1, &t1);
395   }
396   fe_mul_ttt(&t0, &t1, &t0);
397   fe_sq_tt(&t0, &t0);
398   for (i = 1; i < 2; ++i) {
399     fe_sq_tt(&t0, &t0);
400   }
401   fe_mul_ttt(out, &t0, z);
402 }
403 
404 
405 // Group operations.
406 
x25519_ge_tobytes(uint8_t s[32],const ge_p2 * h)407 void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
408   fe recip;
409   fe x;
410   fe y;
411 
412   fe_invert(&recip, &h->Z);
413   fe_mul_ttt(&x, &h->X, &recip);
414   fe_mul_ttt(&y, &h->Y, &recip);
415   fe_tobytes(s, &y);
416   s[31] ^= fe_isnegative(&x) << 7;
417 }
418 
x25519_ge_frombytes_vartime(ge_p3 * h,const uint8_t s[32])419 int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
420   fe u;
421   fe_loose v;
422   fe v3;
423   fe vxx;
424   fe_loose check;
425 
426   fe_frombytes(&h->Y, s);
427   fe_1(&h->Z);
428   fe_sq_tt(&v3, &h->Y);
429   fe_mul_ttt(&vxx, &v3, &d);
430   fe_sub(&v, &v3, &h->Z);  // u = y^2-1
431   fe_carry(&u, &v);
432   fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
433 
434   fe_sq_tl(&v3, &v);
435   fe_mul_ttl(&v3, &v3, &v);  // v3 = v^3
436   fe_sq_tt(&h->X, &v3);
437   fe_mul_ttl(&h->X, &h->X, &v);
438   fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^7
439 
440   fe_pow22523(&h->X, &h->X);  // x = (uv^7)^((q-5)/8)
441   fe_mul_ttt(&h->X, &h->X, &v3);
442   fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^3(uv^7)^((q-5)/8)
443 
444   fe_sq_tt(&vxx, &h->X);
445   fe_mul_ttl(&vxx, &vxx, &v);
446   fe_sub(&check, &vxx, &u);
447   if (fe_isnonzero(&check)) {
448     fe_add(&check, &vxx, &u);
449     if (fe_isnonzero(&check)) {
450       return 0;
451     }
452     fe_mul_ttt(&h->X, &h->X, &sqrtm1);
453   }
454 
455   if (fe_isnegative(&h->X) != (s[31] >> 7)) {
456     fe_loose t;
457     fe_neg(&t, &h->X);
458     fe_carry(&h->X, &t);
459   }
460 
461   fe_mul_ttt(&h->T, &h->X, &h->Y);
462   return 1;
463 }
464 
ge_p2_0(ge_p2 * h)465 static void ge_p2_0(ge_p2 *h) {
466   fe_0(&h->X);
467   fe_1(&h->Y);
468   fe_1(&h->Z);
469 }
470 
471 // r = p
ge_p3_to_p2(ge_p2 * r,const ge_p3 * p)472 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
473   fe_copy(&r->X, &p->X);
474   fe_copy(&r->Y, &p->Y);
475   fe_copy(&r->Z, &p->Z);
476 }
477 
478 // r = p
x25519_ge_p3_to_cached(ge_cached * r,const ge_p3 * p)479 void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
480   fe_add(&r->YplusX, &p->Y, &p->X);
481   fe_sub(&r->YminusX, &p->Y, &p->X);
482   fe_copy_lt(&r->Z, &p->Z);
483   fe_mul_ltt(&r->T2d, &p->T, &d2);
484 }
485 
486 // r = p
x25519_ge_p1p1_to_p2(ge_p2 * r,const ge_p1p1 * p)487 void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
488   fe_mul_tll(&r->X, &p->X, &p->T);
489   fe_mul_tll(&r->Y, &p->Y, &p->Z);
490   fe_mul_tll(&r->Z, &p->Z, &p->T);
491 }
492 
493 // r = p
x25519_ge_p1p1_to_p3(ge_p3 * r,const ge_p1p1 * p)494 void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
495   fe_mul_tll(&r->X, &p->X, &p->T);
496   fe_mul_tll(&r->Y, &p->Y, &p->Z);
497   fe_mul_tll(&r->Z, &p->Z, &p->T);
498   fe_mul_tll(&r->T, &p->X, &p->Y);
499 }
500 
501 // r = 2 * p
ge_p2_dbl(ge_p1p1 * r,const ge_p2 * p)502 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
503   fe trX, trZ, trT;
504   fe t0;
505 
506   fe_sq_tt(&trX, &p->X);
507   fe_sq_tt(&trZ, &p->Y);
508   fe_sq2_tt(&trT, &p->Z);
509   fe_add(&r->Y, &p->X, &p->Y);
510   fe_sq_tl(&t0, &r->Y);
511 
512   fe_add(&r->Y, &trZ, &trX);
513   fe_sub(&r->Z, &trZ, &trX);
514   fe_carry(&trZ, &r->Y);
515   fe_sub(&r->X, &t0, &trZ);
516   fe_carry(&trZ, &r->Z);
517   fe_sub(&r->T, &trT, &trZ);
518 }
519 
520 // r = 2 * p
ge_p3_dbl(ge_p1p1 * r,const ge_p3 * p)521 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
522   ge_p2 q;
523   ge_p3_to_p2(&q, p);
524   ge_p2_dbl(r, &q);
525 }
526 
527 // r = p + q
ge_madd(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)528 static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
529   fe trY, trZ, trT;
530 
531   fe_add(&r->X, &p->Y, &p->X);
532   fe_sub(&r->Y, &p->Y, &p->X);
533   fe_mul_tll(&trZ, &r->X, &q->yplusx);
534   fe_mul_tll(&trY, &r->Y, &q->yminusx);
535   fe_mul_tlt(&trT, &q->xy2d, &p->T);
536   fe_add(&r->T, &p->Z, &p->Z);
537   fe_sub(&r->X, &trZ, &trY);
538   fe_add(&r->Y, &trZ, &trY);
539   fe_carry(&trZ, &r->T);
540   fe_add(&r->Z, &trZ, &trT);
541   fe_sub(&r->T, &trZ, &trT);
542 }
543 
544 // r = p - q
ge_msub(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)545 static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
546   fe trY, trZ, trT;
547 
548   fe_add(&r->X, &p->Y, &p->X);
549   fe_sub(&r->Y, &p->Y, &p->X);
550   fe_mul_tll(&trZ, &r->X, &q->yminusx);
551   fe_mul_tll(&trY, &r->Y, &q->yplusx);
552   fe_mul_tlt(&trT, &q->xy2d, &p->T);
553   fe_add(&r->T, &p->Z, &p->Z);
554   fe_sub(&r->X, &trZ, &trY);
555   fe_add(&r->Y, &trZ, &trY);
556   fe_carry(&trZ, &r->T);
557   fe_sub(&r->Z, &trZ, &trT);
558   fe_add(&r->T, &trZ, &trT);
559 }
560 
561 // r = p + q
x25519_ge_add(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)562 void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
563   fe trX, trY, trZ, trT;
564 
565   fe_add(&r->X, &p->Y, &p->X);
566   fe_sub(&r->Y, &p->Y, &p->X);
567   fe_mul_tll(&trZ, &r->X, &q->YplusX);
568   fe_mul_tll(&trY, &r->Y, &q->YminusX);
569   fe_mul_tlt(&trT, &q->T2d, &p->T);
570   fe_mul_ttl(&trX, &p->Z, &q->Z);
571   fe_add(&r->T, &trX, &trX);
572   fe_sub(&r->X, &trZ, &trY);
573   fe_add(&r->Y, &trZ, &trY);
574   fe_carry(&trZ, &r->T);
575   fe_add(&r->Z, &trZ, &trT);
576   fe_sub(&r->T, &trZ, &trT);
577 }
578 
579 // r = p - q
x25519_ge_sub(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)580 void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
581   fe trX, trY, trZ, trT;
582 
583   fe_add(&r->X, &p->Y, &p->X);
584   fe_sub(&r->Y, &p->Y, &p->X);
585   fe_mul_tll(&trZ, &r->X, &q->YminusX);
586   fe_mul_tll(&trY, &r->Y, &q->YplusX);
587   fe_mul_tlt(&trT, &q->T2d, &p->T);
588   fe_mul_ttl(&trX, &p->Z, &q->Z);
589   fe_add(&r->T, &trX, &trX);
590   fe_sub(&r->X, &trZ, &trY);
591   fe_add(&r->Y, &trZ, &trY);
592   fe_carry(&trZ, &r->T);
593   fe_sub(&r->Z, &trZ, &trT);
594   fe_add(&r->T, &trZ, &trT);
595 }
596 
slide(signed char * r,const uint8_t * a)597 static void slide(signed char *r, const uint8_t *a) {
598   int i;
599   int b;
600   int k;
601 
602   for (i = 0; i < 256; ++i) {
603     r[i] = 1 & (a[i >> 3] >> (i & 7));
604   }
605 
606   for (i = 0; i < 256; ++i) {
607     if (r[i]) {
608       for (b = 1; b <= 6 && i + b < 256; ++b) {
609         if (r[i + b]) {
610           if (r[i] + (r[i + b] << b) <= 15) {
611             r[i] += r[i + b] << b;
612             r[i + b] = 0;
613           } else if (r[i] - (r[i + b] << b) >= -15) {
614             r[i] -= r[i + b] << b;
615             for (k = i + b; k < 256; ++k) {
616               if (!r[k]) {
617                 r[k] = 1;
618                 break;
619               }
620               r[k] = 0;
621             }
622           } else {
623             break;
624           }
625         }
626       }
627     }
628   }
629 }
630 
631 // r = a * A + b * B
632 // where a = a[0]+256*a[1]+...+256^31 a[31].
633 // and b = b[0]+256*b[1]+...+256^31 b[31].
634 // B is the Ed25519 base point (x,4/5) with x positive.
ge_double_scalarmult_vartime(ge_p2 * r,const uint8_t * a,const ge_p3 * A,const uint8_t * b)635 static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
636                                          const ge_p3 *A, const uint8_t *b) {
637   signed char aslide[256];
638   signed char bslide[256];
639   ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
640   ge_p1p1 t;
641   ge_p3 u;
642   ge_p3 A2;
643   int i;
644 
645   slide(aslide, a);
646   slide(bslide, b);
647 
648   x25519_ge_p3_to_cached(&Ai[0], A);
649   ge_p3_dbl(&t, A);
650   x25519_ge_p1p1_to_p3(&A2, &t);
651   x25519_ge_add(&t, &A2, &Ai[0]);
652   x25519_ge_p1p1_to_p3(&u, &t);
653   x25519_ge_p3_to_cached(&Ai[1], &u);
654   x25519_ge_add(&t, &A2, &Ai[1]);
655   x25519_ge_p1p1_to_p3(&u, &t);
656   x25519_ge_p3_to_cached(&Ai[2], &u);
657   x25519_ge_add(&t, &A2, &Ai[2]);
658   x25519_ge_p1p1_to_p3(&u, &t);
659   x25519_ge_p3_to_cached(&Ai[3], &u);
660   x25519_ge_add(&t, &A2, &Ai[3]);
661   x25519_ge_p1p1_to_p3(&u, &t);
662   x25519_ge_p3_to_cached(&Ai[4], &u);
663   x25519_ge_add(&t, &A2, &Ai[4]);
664   x25519_ge_p1p1_to_p3(&u, &t);
665   x25519_ge_p3_to_cached(&Ai[5], &u);
666   x25519_ge_add(&t, &A2, &Ai[5]);
667   x25519_ge_p1p1_to_p3(&u, &t);
668   x25519_ge_p3_to_cached(&Ai[6], &u);
669   x25519_ge_add(&t, &A2, &Ai[6]);
670   x25519_ge_p1p1_to_p3(&u, &t);
671   x25519_ge_p3_to_cached(&Ai[7], &u);
672 
673   ge_p2_0(r);
674 
675   for (i = 255; i >= 0; --i) {
676     if (aslide[i] || bslide[i]) {
677       break;
678     }
679   }
680 
681   for (; i >= 0; --i) {
682     ge_p2_dbl(&t, r);
683 
684     if (aslide[i] > 0) {
685       x25519_ge_p1p1_to_p3(&u, &t);
686       x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
687     } else if (aslide[i] < 0) {
688       x25519_ge_p1p1_to_p3(&u, &t);
689       x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
690     }
691 
692     if (bslide[i] > 0) {
693       x25519_ge_p1p1_to_p3(&u, &t);
694       ge_madd(&t, &u, &Bi[bslide[i] / 2]);
695     } else if (bslide[i] < 0) {
696       x25519_ge_p1p1_to_p3(&u, &t);
697       ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
698     }
699 
700     x25519_ge_p1p1_to_p2(r, &t);
701   }
702 }
703 
704 // int64_lshift21 returns |a << 21| but is defined when shifting bits into the
705 // sign bit. This works around a language flaw in C.
int64_lshift21(int64_t a)706 static inline int64_t int64_lshift21(int64_t a) {
707   return (int64_t)((uint64_t)a << 21);
708 }
709 
710 // The set of scalars is \Z/l
711 // where l = 2^252 + 27742317777372353535851937790883648493.
712 
713 // Input:
714 //   s[0]+256*s[1]+...+256^63*s[63] = s
715 //
716 // Output:
717 //   s[0]+256*s[1]+...+256^31*s[31] = s mod l
718 //   where l = 2^252 + 27742317777372353535851937790883648493.
719 //   Overwrites s in place.
x25519_sc_reduce(uint8_t s[64])720 void x25519_sc_reduce(uint8_t s[64]) {
721   int64_t s0 = 2097151 & load_3(s);
722   int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
723   int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
724   int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
725   int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
726   int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
727   int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
728   int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
729   int64_t s8 = 2097151 & load_3(s + 21);
730   int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
731   int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
732   int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
733   int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
734   int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
735   int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
736   int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
737   int64_t s16 = 2097151 & load_3(s + 42);
738   int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
739   int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
740   int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
741   int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
742   int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
743   int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
744   int64_t s23 = (load_4(s + 60) >> 3);
745   int64_t carry0;
746   int64_t carry1;
747   int64_t carry2;
748   int64_t carry3;
749   int64_t carry4;
750   int64_t carry5;
751   int64_t carry6;
752   int64_t carry7;
753   int64_t carry8;
754   int64_t carry9;
755   int64_t carry10;
756   int64_t carry11;
757   int64_t carry12;
758   int64_t carry13;
759   int64_t carry14;
760   int64_t carry15;
761   int64_t carry16;
762 
763   s11 += s23 * 666643;
764   s12 += s23 * 470296;
765   s13 += s23 * 654183;
766   s14 -= s23 * 997805;
767   s15 += s23 * 136657;
768   s16 -= s23 * 683901;
769   s23 = 0;
770 
771   s10 += s22 * 666643;
772   s11 += s22 * 470296;
773   s12 += s22 * 654183;
774   s13 -= s22 * 997805;
775   s14 += s22 * 136657;
776   s15 -= s22 * 683901;
777   s22 = 0;
778 
779   s9 += s21 * 666643;
780   s10 += s21 * 470296;
781   s11 += s21 * 654183;
782   s12 -= s21 * 997805;
783   s13 += s21 * 136657;
784   s14 -= s21 * 683901;
785   s21 = 0;
786 
787   s8 += s20 * 666643;
788   s9 += s20 * 470296;
789   s10 += s20 * 654183;
790   s11 -= s20 * 997805;
791   s12 += s20 * 136657;
792   s13 -= s20 * 683901;
793   s20 = 0;
794 
795   s7 += s19 * 666643;
796   s8 += s19 * 470296;
797   s9 += s19 * 654183;
798   s10 -= s19 * 997805;
799   s11 += s19 * 136657;
800   s12 -= s19 * 683901;
801   s19 = 0;
802 
803   s6 += s18 * 666643;
804   s7 += s18 * 470296;
805   s8 += s18 * 654183;
806   s9 -= s18 * 997805;
807   s10 += s18 * 136657;
808   s11 -= s18 * 683901;
809   s18 = 0;
810 
811   carry6 = (s6 + (1 << 20)) >> 21;
812   s7 += carry6;
813   s6 -= int64_lshift21(carry6);
814   carry8 = (s8 + (1 << 20)) >> 21;
815   s9 += carry8;
816   s8 -= int64_lshift21(carry8);
817   carry10 = (s10 + (1 << 20)) >> 21;
818   s11 += carry10;
819   s10 -= int64_lshift21(carry10);
820   carry12 = (s12 + (1 << 20)) >> 21;
821   s13 += carry12;
822   s12 -= int64_lshift21(carry12);
823   carry14 = (s14 + (1 << 20)) >> 21;
824   s15 += carry14;
825   s14 -= int64_lshift21(carry14);
826   carry16 = (s16 + (1 << 20)) >> 21;
827   s17 += carry16;
828   s16 -= int64_lshift21(carry16);
829 
830   carry7 = (s7 + (1 << 20)) >> 21;
831   s8 += carry7;
832   s7 -= int64_lshift21(carry7);
833   carry9 = (s9 + (1 << 20)) >> 21;
834   s10 += carry9;
835   s9 -= int64_lshift21(carry9);
836   carry11 = (s11 + (1 << 20)) >> 21;
837   s12 += carry11;
838   s11 -= int64_lshift21(carry11);
839   carry13 = (s13 + (1 << 20)) >> 21;
840   s14 += carry13;
841   s13 -= int64_lshift21(carry13);
842   carry15 = (s15 + (1 << 20)) >> 21;
843   s16 += carry15;
844   s15 -= int64_lshift21(carry15);
845 
846   s5 += s17 * 666643;
847   s6 += s17 * 470296;
848   s7 += s17 * 654183;
849   s8 -= s17 * 997805;
850   s9 += s17 * 136657;
851   s10 -= s17 * 683901;
852   s17 = 0;
853 
854   s4 += s16 * 666643;
855   s5 += s16 * 470296;
856   s6 += s16 * 654183;
857   s7 -= s16 * 997805;
858   s8 += s16 * 136657;
859   s9 -= s16 * 683901;
860   s16 = 0;
861 
862   s3 += s15 * 666643;
863   s4 += s15 * 470296;
864   s5 += s15 * 654183;
865   s6 -= s15 * 997805;
866   s7 += s15 * 136657;
867   s8 -= s15 * 683901;
868   s15 = 0;
869 
870   s2 += s14 * 666643;
871   s3 += s14 * 470296;
872   s4 += s14 * 654183;
873   s5 -= s14 * 997805;
874   s6 += s14 * 136657;
875   s7 -= s14 * 683901;
876   s14 = 0;
877 
878   s1 += s13 * 666643;
879   s2 += s13 * 470296;
880   s3 += s13 * 654183;
881   s4 -= s13 * 997805;
882   s5 += s13 * 136657;
883   s6 -= s13 * 683901;
884   s13 = 0;
885 
886   s0 += s12 * 666643;
887   s1 += s12 * 470296;
888   s2 += s12 * 654183;
889   s3 -= s12 * 997805;
890   s4 += s12 * 136657;
891   s5 -= s12 * 683901;
892   s12 = 0;
893 
894   carry0 = (s0 + (1 << 20)) >> 21;
895   s1 += carry0;
896   s0 -= int64_lshift21(carry0);
897   carry2 = (s2 + (1 << 20)) >> 21;
898   s3 += carry2;
899   s2 -= int64_lshift21(carry2);
900   carry4 = (s4 + (1 << 20)) >> 21;
901   s5 += carry4;
902   s4 -= int64_lshift21(carry4);
903   carry6 = (s6 + (1 << 20)) >> 21;
904   s7 += carry6;
905   s6 -= int64_lshift21(carry6);
906   carry8 = (s8 + (1 << 20)) >> 21;
907   s9 += carry8;
908   s8 -= int64_lshift21(carry8);
909   carry10 = (s10 + (1 << 20)) >> 21;
910   s11 += carry10;
911   s10 -= int64_lshift21(carry10);
912 
913   carry1 = (s1 + (1 << 20)) >> 21;
914   s2 += carry1;
915   s1 -= int64_lshift21(carry1);
916   carry3 = (s3 + (1 << 20)) >> 21;
917   s4 += carry3;
918   s3 -= int64_lshift21(carry3);
919   carry5 = (s5 + (1 << 20)) >> 21;
920   s6 += carry5;
921   s5 -= int64_lshift21(carry5);
922   carry7 = (s7 + (1 << 20)) >> 21;
923   s8 += carry7;
924   s7 -= int64_lshift21(carry7);
925   carry9 = (s9 + (1 << 20)) >> 21;
926   s10 += carry9;
927   s9 -= int64_lshift21(carry9);
928   carry11 = (s11 + (1 << 20)) >> 21;
929   s12 += carry11;
930   s11 -= int64_lshift21(carry11);
931 
932   s0 += s12 * 666643;
933   s1 += s12 * 470296;
934   s2 += s12 * 654183;
935   s3 -= s12 * 997805;
936   s4 += s12 * 136657;
937   s5 -= s12 * 683901;
938   s12 = 0;
939 
940   carry0 = s0 >> 21;
941   s1 += carry0;
942   s0 -= int64_lshift21(carry0);
943   carry1 = s1 >> 21;
944   s2 += carry1;
945   s1 -= int64_lshift21(carry1);
946   carry2 = s2 >> 21;
947   s3 += carry2;
948   s2 -= int64_lshift21(carry2);
949   carry3 = s3 >> 21;
950   s4 += carry3;
951   s3 -= int64_lshift21(carry3);
952   carry4 = s4 >> 21;
953   s5 += carry4;
954   s4 -= int64_lshift21(carry4);
955   carry5 = s5 >> 21;
956   s6 += carry5;
957   s5 -= int64_lshift21(carry5);
958   carry6 = s6 >> 21;
959   s7 += carry6;
960   s6 -= int64_lshift21(carry6);
961   carry7 = s7 >> 21;
962   s8 += carry7;
963   s7 -= int64_lshift21(carry7);
964   carry8 = s8 >> 21;
965   s9 += carry8;
966   s8 -= int64_lshift21(carry8);
967   carry9 = s9 >> 21;
968   s10 += carry9;
969   s9 -= int64_lshift21(carry9);
970   carry10 = s10 >> 21;
971   s11 += carry10;
972   s10 -= int64_lshift21(carry10);
973   carry11 = s11 >> 21;
974   s12 += carry11;
975   s11 -= int64_lshift21(carry11);
976 
977   s0 += s12 * 666643;
978   s1 += s12 * 470296;
979   s2 += s12 * 654183;
980   s3 -= s12 * 997805;
981   s4 += s12 * 136657;
982   s5 -= s12 * 683901;
983   s12 = 0;
984 
985   carry0 = s0 >> 21;
986   s1 += carry0;
987   s0 -= int64_lshift21(carry0);
988   carry1 = s1 >> 21;
989   s2 += carry1;
990   s1 -= int64_lshift21(carry1);
991   carry2 = s2 >> 21;
992   s3 += carry2;
993   s2 -= int64_lshift21(carry2);
994   carry3 = s3 >> 21;
995   s4 += carry3;
996   s3 -= int64_lshift21(carry3);
997   carry4 = s4 >> 21;
998   s5 += carry4;
999   s4 -= int64_lshift21(carry4);
1000   carry5 = s5 >> 21;
1001   s6 += carry5;
1002   s5 -= int64_lshift21(carry5);
1003   carry6 = s6 >> 21;
1004   s7 += carry6;
1005   s6 -= int64_lshift21(carry6);
1006   carry7 = s7 >> 21;
1007   s8 += carry7;
1008   s7 -= int64_lshift21(carry7);
1009   carry8 = s8 >> 21;
1010   s9 += carry8;
1011   s8 -= int64_lshift21(carry8);
1012   carry9 = s9 >> 21;
1013   s10 += carry9;
1014   s9 -= int64_lshift21(carry9);
1015   carry10 = s10 >> 21;
1016   s11 += carry10;
1017   s10 -= int64_lshift21(carry10);
1018 
1019   s[0] = s0 >> 0;
1020   s[1] = s0 >> 8;
1021   s[2] = (s0 >> 16) | (s1 << 5);
1022   s[3] = s1 >> 3;
1023   s[4] = s1 >> 11;
1024   s[5] = (s1 >> 19) | (s2 << 2);
1025   s[6] = s2 >> 6;
1026   s[7] = (s2 >> 14) | (s3 << 7);
1027   s[8] = s3 >> 1;
1028   s[9] = s3 >> 9;
1029   s[10] = (s3 >> 17) | (s4 << 4);
1030   s[11] = s4 >> 4;
1031   s[12] = s4 >> 12;
1032   s[13] = (s4 >> 20) | (s5 << 1);
1033   s[14] = s5 >> 7;
1034   s[15] = (s5 >> 15) | (s6 << 6);
1035   s[16] = s6 >> 2;
1036   s[17] = s6 >> 10;
1037   s[18] = (s6 >> 18) | (s7 << 3);
1038   s[19] = s7 >> 5;
1039   s[20] = s7 >> 13;
1040   s[21] = s8 >> 0;
1041   s[22] = s8 >> 8;
1042   s[23] = (s8 >> 16) | (s9 << 5);
1043   s[24] = s9 >> 3;
1044   s[25] = s9 >> 11;
1045   s[26] = (s9 >> 19) | (s10 << 2);
1046   s[27] = s10 >> 6;
1047   s[28] = (s10 >> 14) | (s11 << 7);
1048   s[29] = s11 >> 1;
1049   s[30] = s11 >> 9;
1050   s[31] = s11 >> 17;
1051 }
1052 
ED25519_verify(const uint8_t * message,size_t message_len,const uint8_t signature[64],const uint8_t public_key[32])1053 int ED25519_verify(const uint8_t *message, size_t message_len,
1054                    const uint8_t signature[64], const uint8_t public_key[32]) {
1055   ge_p3 A;
1056   if ((signature[63] & 224) != 0 ||
1057       !x25519_ge_frombytes_vartime(&A, public_key)) {
1058     return 0;
1059   }
1060 
1061   fe_loose t;
1062   fe_neg(&t, &A.X);
1063   fe_carry(&A.X, &t);
1064   fe_neg(&t, &A.T);
1065   fe_carry(&A.T, &t);
1066 
1067   uint8_t pkcopy[32];
1068   memcpy(pkcopy, public_key, 32);
1069   uint8_t rcopy[32];
1070   memcpy(rcopy, signature, 32);
1071   union {
1072     uint64_t u64[4];
1073     uint8_t u8[32];
1074   } scopy;
1075   memcpy(&scopy.u8[0], signature + 32, 32);
1076 
1077   // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
1078   // the range [0, order) in order to prevent signature malleability.
1079 
1080   // kOrder is the order of Curve25519 in little-endian form.
1081   static const uint64_t kOrder[4] = {
1082     UINT64_C(0x5812631a5cf5d3ed),
1083     UINT64_C(0x14def9dea2f79cd6),
1084     0,
1085     UINT64_C(0x1000000000000000),
1086   };
1087   for (size_t i = 3;; i--) {
1088     if (scopy.u64[i] > kOrder[i]) {
1089       return 0;
1090     } else if (scopy.u64[i] < kOrder[i]) {
1091       break;
1092     } else if (i == 0) {
1093       return 0;
1094     }
1095   }
1096 
1097 #if defined(MCUBOOT_USE_MBED_TLS)
1098 
1099   mbedtls_sha512_context ctx;
1100   int ret;
1101 
1102   mbedtls_sha512_init(&ctx);
1103 
1104   ret = mbedtls_sha512_starts_ret(&ctx, 0);
1105   assert(ret == 0);
1106 
1107   ret = mbedtls_sha512_update_ret(&ctx, signature, 32);
1108   assert(ret == 0);
1109   ret = mbedtls_sha512_update_ret(&ctx, public_key, 32);
1110   assert(ret == 0);
1111   ret = mbedtls_sha512_update_ret(&ctx, message, message_len);
1112   assert(ret == 0);
1113 
1114   uint8_t h[SHA512_DIGEST_LENGTH];
1115   ret = mbedtls_sha512_finish_ret(&ctx, h);
1116   assert(ret == 0);
1117   mbedtls_sha512_free(&ctx);
1118 
1119 #else
1120 
1121   struct tc_sha512_state_struct s;
1122   int rc;
1123 
1124   rc = tc_sha512_init(&s);
1125   assert(rc == TC_CRYPTO_SUCCESS);
1126 
1127   rc = tc_sha512_update(&s, signature, 32);
1128   assert(rc == TC_CRYPTO_SUCCESS);
1129   rc = tc_sha512_update(&s, public_key, 32);
1130   assert(rc == TC_CRYPTO_SUCCESS);
1131   rc = tc_sha512_update(&s, message, message_len);
1132   assert(rc == TC_CRYPTO_SUCCESS);
1133 
1134   uint8_t h[TC_SHA512_DIGEST_SIZE];
1135   rc = tc_sha512_final(h, &s);
1136   assert(rc == TC_CRYPTO_SUCCESS);
1137 
1138 #endif
1139 
1140   x25519_sc_reduce(h);
1141 
1142   ge_p2 R;
1143   ge_double_scalarmult_vartime(&R, h, &A, scopy.u8);
1144 
1145   uint8_t rcheck[32];
1146   x25519_ge_tobytes(rcheck, &R);
1147 
1148   return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
1149 }
1150 
fe_cswap(fe * f,fe * g,fe_limb_t b)1151 static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
1152   b = 0-b;
1153   for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
1154     fe_limb_t x = f->v[i] ^ g->v[i];
1155     x &= b;
1156     f->v[i] ^= x;
1157     g->v[i] ^= x;
1158   }
1159 }
1160 
fiat_25519_carry_scmul_121666(uint32_t out1[10],const uint32_t arg1[10])1161 static void fiat_25519_carry_scmul_121666(uint32_t out1[10], const uint32_t arg1[10]) {
1162   uint64_t x1 = ((uint64_t)UINT32_C(0x1db42) * (arg1[9]));
1163   uint64_t x2 = ((uint64_t)UINT32_C(0x1db42) * (arg1[8]));
1164   uint64_t x3 = ((uint64_t)UINT32_C(0x1db42) * (arg1[7]));
1165   uint64_t x4 = ((uint64_t)UINT32_C(0x1db42) * (arg1[6]));
1166   uint64_t x5 = ((uint64_t)UINT32_C(0x1db42) * (arg1[5]));
1167   uint64_t x6 = ((uint64_t)UINT32_C(0x1db42) * (arg1[4]));
1168   uint64_t x7 = ((uint64_t)UINT32_C(0x1db42) * (arg1[3]));
1169   uint64_t x8 = ((uint64_t)UINT32_C(0x1db42) * (arg1[2]));
1170   uint64_t x9 = ((uint64_t)UINT32_C(0x1db42) * (arg1[1]));
1171   uint64_t x10 = ((uint64_t)UINT32_C(0x1db42) * (arg1[0]));
1172   uint32_t x11 = (uint32_t)(x10 >> 26);
1173   uint32_t x12 = (uint32_t)(x10 & UINT32_C(0x3ffffff));
1174   uint64_t x13 = (x11 + x9);
1175   uint32_t x14 = (uint32_t)(x13 >> 25);
1176   uint32_t x15 = (uint32_t)(x13 & UINT32_C(0x1ffffff));
1177   uint64_t x16 = (x14 + x8);
1178   uint32_t x17 = (uint32_t)(x16 >> 26);
1179   uint32_t x18 = (uint32_t)(x16 & UINT32_C(0x3ffffff));
1180   uint64_t x19 = (x17 + x7);
1181   uint32_t x20 = (uint32_t)(x19 >> 25);
1182   uint32_t x21 = (uint32_t)(x19 & UINT32_C(0x1ffffff));
1183   uint64_t x22 = (x20 + x6);
1184   uint32_t x23 = (uint32_t)(x22 >> 26);
1185   uint32_t x24 = (uint32_t)(x22 & UINT32_C(0x3ffffff));
1186   uint64_t x25 = (x23 + x5);
1187   uint32_t x26 = (uint32_t)(x25 >> 25);
1188   uint32_t x27 = (uint32_t)(x25 & UINT32_C(0x1ffffff));
1189   uint64_t x28 = (x26 + x4);
1190   uint32_t x29 = (uint32_t)(x28 >> 26);
1191   uint32_t x30 = (uint32_t)(x28 & UINT32_C(0x3ffffff));
1192   uint64_t x31 = (x29 + x3);
1193   uint32_t x32 = (uint32_t)(x31 >> 25);
1194   uint32_t x33 = (uint32_t)(x31 & UINT32_C(0x1ffffff));
1195   uint64_t x34 = (x32 + x2);
1196   uint32_t x35 = (uint32_t)(x34 >> 26);
1197   uint32_t x36 = (uint32_t)(x34 & UINT32_C(0x3ffffff));
1198   uint64_t x37 = (x35 + x1);
1199   uint32_t x38 = (uint32_t)(x37 >> 25);
1200   uint32_t x39 = (uint32_t)(x37 & UINT32_C(0x1ffffff));
1201   uint32_t x40 = (x38 * (uint32_t)UINT8_C(0x13));
1202   uint32_t x41 = (x12 + x40);
1203   uint32_t x42 = (x41 >> 26);
1204   uint32_t x43 = (x41 & UINT32_C(0x3ffffff));
1205   uint32_t x44 = (x42 + x15);
1206   uint32_t x45 = (x44 >> 25);
1207   uint32_t x46 = (x44 & UINT32_C(0x1ffffff));
1208   uint32_t x47 = (x45 + x18);
1209   out1[0] = x43;
1210   out1[1] = x46;
1211   out1[2] = x47;
1212   out1[3] = x21;
1213   out1[4] = x24;
1214   out1[5] = x27;
1215   out1[6] = x30;
1216   out1[7] = x33;
1217   out1[8] = x36;
1218   out1[9] = x39;
1219 }
1220 
fe_mul121666(fe * h,const fe_loose * f)1221 static void fe_mul121666(fe *h, const fe_loose *f) {
1222   assert_fe_loose(f->v);
1223   fiat_25519_carry_scmul_121666(h->v, f->v);
1224   assert_fe(h->v);
1225 }
1226 
x25519_scalar_mult_generic(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])1227 static void x25519_scalar_mult_generic(uint8_t out[32],
1228                                        const uint8_t scalar[32],
1229                                        const uint8_t point[32]) {
1230   fe x1, x2, z2, x3, z3, tmp0, tmp1;
1231   fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
1232 
1233   uint8_t e[32];
1234   memcpy(e, scalar, 32);
1235   e[0] &= 248;
1236   e[31] &= 127;
1237   e[31] |= 64;
1238 
1239   // The following implementation was transcribed to Coq and proven to
1240   // correspond to unary scalar multiplication in affine coordinates given that
1241   // x1 != 0 is the x coordinate of some point on the curve. It was also checked
1242   // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
1243   // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
1244   // underlying field, so it applies to Curve25519 itself and the quadratic
1245   // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
1246   // correctly simulates extension-field arithmetic on prime-field values.
1247   // The decoding of the byte array representation of e was not considered.
1248   // Specification of Montgomery curves in affine coordinates:
1249   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
1250   // Proof that these form a group that is isomorphic to a Weierstrass curve:
1251   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
1252   // Coq transcription and correctness proof of the loop (where scalarbits=255):
1253   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
1254   // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
1255   // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
1256   fe_frombytes(&x1, point);
1257   fe_1(&x2);
1258   fe_0(&z2);
1259   fe_copy(&x3, &x1);
1260   fe_1(&z3);
1261 
1262   unsigned swap = 0;
1263   int pos;
1264   for (pos = 254; pos >= 0; --pos) {
1265     // loop invariant as of right before the test, for the case where x1 != 0:
1266     //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
1267     //   let r := e >> (pos+1) in the following equalities of projective points:
1268     //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
1269     //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
1270     //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
1271     unsigned b = 1 & (e[pos / 8] >> (pos & 7));
1272     swap ^= b;
1273     fe_cswap(&x2, &x3, swap);
1274     fe_cswap(&z2, &z3, swap);
1275     swap = b;
1276     // Coq transcription of ladderstep formula (called from transcribed loop):
1277     // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
1278     // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
1279     // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
1280     // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
1281     fe_sub(&tmp0l, &x3, &z3);
1282     fe_sub(&tmp1l, &x2, &z2);
1283     fe_add(&x2l, &x2, &z2);
1284     fe_add(&z2l, &x3, &z3);
1285     fe_mul_tll(&z3, &tmp0l, &x2l);
1286     fe_mul_tll(&z2, &z2l, &tmp1l);
1287     fe_sq_tl(&tmp0, &tmp1l);
1288     fe_sq_tl(&tmp1, &x2l);
1289     fe_add(&x3l, &z3, &z2);
1290     fe_sub(&z2l, &z3, &z2);
1291     fe_mul_ttt(&x2, &tmp1, &tmp0);
1292     fe_sub(&tmp1l, &tmp1, &tmp0);
1293     fe_sq_tl(&z2, &z2l);
1294     fe_mul121666(&z3, &tmp1l);
1295     fe_sq_tl(&x3, &x3l);
1296     fe_add(&tmp0l, &tmp0, &z3);
1297     fe_mul_ttt(&z3, &x1, &z2);
1298     fe_mul_tll(&z2, &tmp1l, &tmp0l);
1299   }
1300   // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
1301   fe_cswap(&x2, &x3, swap);
1302   fe_cswap(&z2, &z3, swap);
1303 
1304   fe_invert(&z2, &z2);
1305   fe_mul_ttt(&x2, &x2, &z2);
1306   fe_tobytes(out, &x2);
1307 }
1308 
X25519(uint8_t out_shared_key[32],const uint8_t private_key[32],const uint8_t peer_public_value[32])1309 int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
1310            const uint8_t peer_public_value[32]) {
1311   static const uint8_t kZeros[32] = {0};
1312   x25519_scalar_mult_generic(out_shared_key, private_key, peer_public_value);
1313   // The all-zero output results when the input is a point of small order.
1314   return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
1315 }
1316