1 /**
2 * \brief Multi-precision integer library, ESP32 hardware accelerated parts
3 *
4 * based on mbedTLS implementation
5 *
6 * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved
7 * Additions Copyright (C) 2016, Espressif Systems (Shanghai) PTE Ltd
8 * SPDX-License-Identifier: Apache-2.0
9 *
10 * Licensed under the Apache License, Version 2.0 (the "License"); you may
11 * not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
13 *
14 * http://www.apache.org/licenses/LICENSE-2.0
15 *
16 * Unless required by applicable law or agreed to in writing, software
17 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
18 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 * See the License for the specific language governing permissions and
20 * limitations under the License.
21 *
22 */
23 #include <stdio.h>
24 #include <string.h>
25 #include <malloc.h>
26 #include <limits.h>
27 #include <assert.h>
28 #include <stdlib.h>
29 #include <sys/param.h>
30 #include "soc/hwcrypto_periph.h"
31 #include "esp_system.h"
32 #include "esp_log.h"
33 #include "esp_attr.h"
34 #include "bignum_impl.h"
35 #include "soc/soc_caps.h"
36
37 #include <mbedtls/bignum.h>
38
39
40 /* Some implementation notes:
41 *
42 * - Naming convention x_words, y_words, z_words for number of words (limbs) used in a particular
43 * bignum. This number may be less than the size of the bignum
44 *
45 * - Naming convention hw_words for the hardware length of the operation. This number maybe be rounded up
46 * for targets that requres this (e.g. ESP32), and may be larger than any of the numbers
47 * involved in the calculation.
48 *
49 * - Timing behaviour of these functions will depend on the length of the inputs. This is fundamentally
50 * the same constraint as the software mbedTLS implementations, and relies on the same
51 * countermeasures (exponent blinding, etc) which are used in mbedTLS.
52 */
53
54 static const __attribute__((unused)) char *TAG = "bignum";
55
56 #define ciL (sizeof(mbedtls_mpi_uint)) /* chars in limb */
57 #define biL (ciL << 3) /* bits in limb */
58
59
60 /* Convert bit count to word count
61 */
bits_to_words(size_t bits)62 static inline size_t bits_to_words(size_t bits)
63 {
64 return (bits + 31) / 32;
65 }
66
67 /* Return the number of words actually used to represent an mpi
68 number.
69 */
70 #if defined(MBEDTLS_MPI_EXP_MOD_ALT) || defined(MBEDTLS_MPI_EXP_MOD_ALT_FALLBACK)
mpi_words(const mbedtls_mpi * mpi)71 static size_t mpi_words(const mbedtls_mpi *mpi)
72 {
73 for (size_t i = mpi->n; i > 0; i--) {
74 if (mpi->p[i - 1] != 0) {
75 return i;
76 }
77 }
78 return 0;
79 }
80
81 #endif //(MBEDTLS_MPI_EXP_MOD_ALT || MBEDTLS_MPI_EXP_MOD_ALT_FALLBACK)
82
83 /**
84 *
85 * There is a need for the value of integer N' such that B^-1(B-1)-N^-1N'=1,
86 * where B^-1(B-1) mod N=1. Actually, only the least significant part of
87 * N' is needed, hence the definition N0'=N' mod b. We reproduce below the
88 * simple algorithm from an article by Dusse and Kaliski to efficiently
89 * find N0' from N0 and b
90 */
modular_inverse(const mbedtls_mpi * M)91 static mbedtls_mpi_uint modular_inverse(const mbedtls_mpi *M)
92 {
93 int i;
94 uint64_t t = 1;
95 uint64_t two_2_i_minus_1 = 2; /* 2^(i-1) */
96 uint64_t two_2_i = 4; /* 2^i */
97 uint64_t N = M->p[0];
98
99 for (i = 2; i <= 32; i++) {
100 if ((mbedtls_mpi_uint) N * t % two_2_i >= two_2_i_minus_1) {
101 t += two_2_i_minus_1;
102 }
103
104 two_2_i_minus_1 <<= 1;
105 two_2_i <<= 1;
106 }
107
108 return (mbedtls_mpi_uint)(UINT32_MAX - t + 1);
109 }
110
111 /* Calculate Rinv = RR^2 mod M, where:
112 *
113 * R = b^n where b = 2^32, n=num_words,
114 * R = 2^N (where N=num_bits)
115 * RR = R^2 = 2^(2*N) (where N=num_bits=num_words*32)
116 *
117 * This calculation is computationally expensive (mbedtls_mpi_mod_mpi)
118 * so caller should cache the result where possible.
119 *
120 * DO NOT call this function while holding esp_mpi_enable_hardware_hw_op().
121 *
122 */
calculate_rinv(mbedtls_mpi * Rinv,const mbedtls_mpi * M,int num_words)123 static int calculate_rinv(mbedtls_mpi *Rinv, const mbedtls_mpi *M, int num_words)
124 {
125 int ret;
126 size_t num_bits = num_words * 32;
127 mbedtls_mpi RR;
128 mbedtls_mpi_init(&RR);
129 MBEDTLS_MPI_CHK(mbedtls_mpi_set_bit(&RR, num_bits * 2, 1));
130 MBEDTLS_MPI_CHK(mbedtls_mpi_mod_mpi(Rinv, &RR, M));
131
132 cleanup:
133 mbedtls_mpi_free(&RR);
134
135 return ret;
136 }
137
138
139
140
141
142
143 /* Z = (X * Y) mod M
144
145 Not an mbedTLS function
146 */
esp_mpi_mul_mpi_mod(mbedtls_mpi * Z,const mbedtls_mpi * X,const mbedtls_mpi * Y,const mbedtls_mpi * M)147 int esp_mpi_mul_mpi_mod(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, const mbedtls_mpi *M)
148 {
149 int ret = 0;
150
151 size_t x_bits = mbedtls_mpi_bitlen(X);
152 size_t y_bits = mbedtls_mpi_bitlen(Y);
153 size_t m_bits = mbedtls_mpi_bitlen(M);
154 size_t z_bits = MIN(m_bits, x_bits + y_bits);
155 size_t x_words = bits_to_words(x_bits);
156 size_t y_words = bits_to_words(y_bits);
157 size_t m_words = bits_to_words(m_bits);
158 size_t z_words = bits_to_words(z_bits);
159 size_t hw_words = esp_mpi_hardware_words(MAX(x_words, MAX(y_words, m_words))); /* longest operand */
160 mbedtls_mpi Rinv;
161 mbedtls_mpi_uint Mprime;
162
163 /* Calculate and load the first stage montgomery multiplication */
164 mbedtls_mpi_init(&Rinv);
165 MBEDTLS_MPI_CHK(calculate_rinv(&Rinv, M, hw_words));
166 Mprime = modular_inverse(M);
167
168 esp_mpi_enable_hardware_hw_op();
169 /* Load and start a (X * Y) mod M calculation */
170 esp_mpi_mul_mpi_mod_hw_op(X, Y, M, &Rinv, Mprime, hw_words);
171
172 MBEDTLS_MPI_CHK(mbedtls_mpi_grow(Z, z_words));
173
174 esp_mpi_read_result_hw_op(Z, z_words);
175 Z->s = X->s * Y->s;
176
177 cleanup:
178 mbedtls_mpi_free(&Rinv);
179 esp_mpi_disable_hardware_hw_op();
180
181 return ret;
182 }
183
184 #if defined(MBEDTLS_MPI_EXP_MOD_ALT) || defined(MBEDTLS_MPI_EXP_MOD_ALT_FALLBACK)
185
186 #ifdef ESP_MPI_USE_MONT_EXP
187 /*
188 * Return the most significant one-bit.
189 */
mbedtls_mpi_msb(const mbedtls_mpi * X)190 static size_t mbedtls_mpi_msb( const mbedtls_mpi *X )
191 {
192 int i, j;
193 if (X != NULL && X->n != 0) {
194 for (i = X->n - 1; i >= 0; i--) {
195 if (X->p[i] != 0) {
196 for (j = biL - 1; j >= 0; j--) {
197 if ((X->p[i] & (1 << j)) != 0) {
198 return (i * biL) + j;
199 }
200 }
201 }
202 }
203 }
204 return 0;
205 }
206
207 /*
208 * Montgomery exponentiation: Z = X ^ Y mod M (HAC 14.94)
209 */
mpi_montgomery_exp_calc(mbedtls_mpi * Z,const mbedtls_mpi * X,const mbedtls_mpi * Y,const mbedtls_mpi * M,mbedtls_mpi * Rinv,size_t hw_words,mbedtls_mpi_uint Mprime)210 static int mpi_montgomery_exp_calc( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, const mbedtls_mpi *M,
211 mbedtls_mpi *Rinv,
212 size_t hw_words,
213 mbedtls_mpi_uint Mprime )
214 {
215 int ret = 0;
216 mbedtls_mpi X_, one;
217
218 mbedtls_mpi_init(&X_);
219 mbedtls_mpi_init(&one);
220 if ( ( ( ret = mbedtls_mpi_grow(&one, hw_words) ) != 0 ) ||
221 ( ( ret = mbedtls_mpi_set_bit(&one, 0, 1) ) != 0 ) ) {
222 goto cleanup2;
223 }
224
225 // Algorithm from HAC 14.94
226 {
227 // 0 determine t (highest bit set in y)
228 int t = mbedtls_mpi_msb(Y);
229
230 esp_mpi_enable_hardware_hw_op();
231
232 // 1.1 x_ = mont(x, R^2 mod m)
233 // = mont(x, rb)
234 MBEDTLS_MPI_CHK( esp_mont_hw_op(&X_, X, Rinv, M, Mprime, hw_words, false) );
235
236 // 1.2 z = R mod m
237 // now z = R mod m = Mont (R^2 mod m, 1) mod M (as Mont(x) = X&R^-1 mod M)
238 MBEDTLS_MPI_CHK( esp_mont_hw_op(Z, Rinv, &one, M, Mprime, hw_words, true) );
239
240 // 2 for i from t down to 0
241 for (int i = t; i >= 0; i--) {
242 // 2.1 z = mont(z,z)
243 if (i != t) { // skip on the first iteration as is still unity
244 MBEDTLS_MPI_CHK( esp_mont_hw_op(Z, Z, Z, M, Mprime, hw_words, true) );
245 }
246
247 // 2.2 if y[i] = 1 then z = mont(A, x_)
248 if (mbedtls_mpi_get_bit(Y, i)) {
249 MBEDTLS_MPI_CHK( esp_mont_hw_op(Z, Z, &X_, M, Mprime, hw_words, true) );
250 }
251 }
252
253 // 3 z = Mont(z, 1)
254 MBEDTLS_MPI_CHK( esp_mont_hw_op(Z, Z, &one, M, Mprime, hw_words, true) );
255 }
256
257 cleanup:
258 esp_mpi_disable_hardware_hw_op();
259
260 cleanup2:
261 mbedtls_mpi_free(&X_);
262 mbedtls_mpi_free(&one);
263 return ret;
264 }
265
266 #endif //USE_MONT_EXPONENATIATION
267
268 /*
269 * Z = X ^ Y mod M
270 *
271 * _Rinv is optional pre-calculated version of Rinv (via calculate_rinv()).
272 *
273 * (See RSA Accelerator section in Technical Reference for more about Mprime, Rinv)
274 *
275 */
esp_mpi_exp_mod(mbedtls_mpi * Z,const mbedtls_mpi * X,const mbedtls_mpi * Y,const mbedtls_mpi * M,mbedtls_mpi * _Rinv)276 static int esp_mpi_exp_mod( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, const mbedtls_mpi *M, mbedtls_mpi *_Rinv )
277 {
278 int ret = 0;
279
280 mbedtls_mpi Rinv_new; /* used if _Rinv == NULL */
281 mbedtls_mpi *Rinv; /* points to _Rinv (if not NULL) othwerwise &RR_new */
282 mbedtls_mpi_uint Mprime;
283
284 size_t x_words = mpi_words(X);
285 size_t y_words = mpi_words(Y);
286 size_t m_words = mpi_words(M);
287
288 /* "all numbers must be the same length", so choose longest number
289 as cardinal length of operation...
290 */
291 size_t num_words = esp_mpi_hardware_words(MAX(m_words, MAX(x_words, y_words)));
292
293 if (num_words * 32 > SOC_RSA_MAX_BIT_LEN) {
294 return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
295 }
296
297 if (mbedtls_mpi_cmp_int(M, 0) <= 0 || (M->p[0] & 1) == 0) {
298 return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
299 }
300
301 if (mbedtls_mpi_cmp_int(Y, 0) < 0) {
302 return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
303 }
304
305 if (mbedtls_mpi_cmp_int(Y, 0) == 0) {
306 return mbedtls_mpi_lset(Z, 1);
307 }
308
309 /* Determine RR pointer, either _RR for cached value
310 or local RR_new */
311 if (_Rinv == NULL) {
312 mbedtls_mpi_init(&Rinv_new);
313 Rinv = &Rinv_new;
314 } else {
315 Rinv = _Rinv;
316 }
317 if (Rinv->p == NULL) {
318 MBEDTLS_MPI_CHK(calculate_rinv(Rinv, M, num_words));
319 }
320
321 Mprime = modular_inverse(M);
322
323 // Montgomery exponentiation: Z = X ^ Y mod M (HAC 14.94)
324 #ifdef ESP_MPI_USE_MONT_EXP
325 ret = mpi_montgomery_exp_calc(Z, X, Y, M, Rinv, num_words, Mprime) ;
326 MBEDTLS_MPI_CHK(ret);
327 #else
328 esp_mpi_enable_hardware_hw_op();
329
330 esp_mpi_exp_mpi_mod_hw_op(X, Y, M, Rinv, Mprime, num_words);
331 ret = mbedtls_mpi_grow(Z, m_words);
332 if (ret != 0) {
333 esp_mpi_disable_hardware_hw_op();
334 goto cleanup;
335 }
336 esp_mpi_read_result_hw_op(Z, m_words);
337 esp_mpi_disable_hardware_hw_op();
338 #endif
339
340 // Compensate for negative X
341 if (X->s == -1 && (Y->p[0] & 1) != 0) {
342 Z->s = -1;
343 MBEDTLS_MPI_CHK(mbedtls_mpi_add_mpi(Z, M, Z));
344 } else {
345 Z->s = 1;
346 }
347
348 cleanup:
349 if (_Rinv == NULL) {
350 mbedtls_mpi_free(&Rinv_new);
351 }
352 return ret;
353 }
354
355 #endif /* (MBEDTLS_MPI_EXP_MOD_ALT || MBEDTLS_MPI_EXP_MOD_ALT_FALLBACK) */
356
357 /*
358 * Sliding-window exponentiation: X = A^E mod N (HAC 14.85)
359 */
mbedtls_mpi_exp_mod(mbedtls_mpi * X,const mbedtls_mpi * A,const mbedtls_mpi * E,const mbedtls_mpi * N,mbedtls_mpi * _RR)360 int mbedtls_mpi_exp_mod( mbedtls_mpi *X, const mbedtls_mpi *A,
361 const mbedtls_mpi *E, const mbedtls_mpi *N,
362 mbedtls_mpi *_RR )
363 {
364 int ret;
365 #if defined(MBEDTLS_MPI_EXP_MOD_ALT_FALLBACK)
366 /* Try hardware API first and then fallback to software */
367 ret = esp_mpi_exp_mod( X, A, E, N, _RR );
368 if( ret == MBEDTLS_ERR_MPI_NOT_ACCEPTABLE ) {
369 ret = mbedtls_mpi_exp_mod_soft( X, A, E, N, _RR );
370 }
371 #else
372 /* Hardware approach */
373 ret = esp_mpi_exp_mod( X, A, E, N, _RR );
374 #endif
375 /* Note: For software only approach, it gets handled in mbedTLS library.
376 This file is not part of build objects for that case */
377
378 return ret;
379 }
380
381 #if defined(MBEDTLS_MPI_MUL_MPI_ALT) /* MBEDTLS_MPI_MUL_MPI_ALT */
382
383 static int mpi_mult_mpi_failover_mod_mult( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t z_words);
384 static int mpi_mult_mpi_overlong(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t y_words, size_t z_words);
385
386 /* Z = X * Y */
mbedtls_mpi_mul_mpi(mbedtls_mpi * Z,const mbedtls_mpi * X,const mbedtls_mpi * Y)387 int mbedtls_mpi_mul_mpi( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y )
388 {
389 int ret = 0;
390 size_t x_bits = mbedtls_mpi_bitlen(X);
391 size_t y_bits = mbedtls_mpi_bitlen(Y);
392 size_t x_words = bits_to_words(x_bits);
393 size_t y_words = bits_to_words(y_bits);
394 size_t z_words = bits_to_words(x_bits + y_bits);
395 size_t hw_words = esp_mpi_hardware_words(MAX(x_words, y_words)); // length of one operand in hardware
396
397 /* Short-circuit eval if either argument is 0 or 1.
398
399 This is needed as the mpi modular division
400 argument will sometimes call in here when one
401 argument is too large for the hardware unit, but the other
402 argument is zero or one.
403 */
404 if (x_bits == 0 || y_bits == 0) {
405 mbedtls_mpi_lset(Z, 0);
406 return 0;
407 }
408 if (x_bits == 1) {
409 ret = mbedtls_mpi_copy(Z, Y);
410 Z->s *= X->s;
411 return ret;
412 }
413 if (y_bits == 1) {
414 ret = mbedtls_mpi_copy(Z, X);
415 Z->s *= Y->s;
416 return ret;
417 }
418
419 /* Grow Z to result size early, avoid interim allocations */
420 MBEDTLS_MPI_CHK( mbedtls_mpi_grow(Z, z_words) );
421
422 /* If either factor is over 2048 bits, we can't use the standard hardware multiplier
423 (it assumes result is double longest factor, and result is max 4096 bits.)
424
425 However, we can fail over to mod_mult for up to 4096 bits of result (modulo
426 multiplication doesn't have the same restriction, so result is simply the
427 number of bits in X plus number of bits in in Y.)
428 */
429 if (hw_words * 32 > SOC_RSA_MAX_BIT_LEN/2) {
430 if (z_words * 32 <= SOC_RSA_MAX_BIT_LEN) {
431 /* Note: it's possible to use mpi_mult_mpi_overlong
432 for this case as well, but it's very slightly
433 slower and requires a memory allocation.
434 */
435 return mpi_mult_mpi_failover_mod_mult(Z, X, Y, z_words);
436 } else {
437 /* Still too long for the hardware unit... */
438 if (y_words > x_words) {
439 return mpi_mult_mpi_overlong(Z, X, Y, y_words, z_words);
440 } else {
441 return mpi_mult_mpi_overlong(Z, Y, X, x_words, z_words);
442 }
443 }
444 }
445
446 /* Otherwise, we can use the (faster) multiply hardware unit */
447 esp_mpi_enable_hardware_hw_op();
448
449 esp_mpi_mul_mpi_hw_op(X, Y, hw_words);
450 esp_mpi_read_result_hw_op(Z, z_words);
451
452 esp_mpi_disable_hardware_hw_op();
453
454 Z->s = X->s * Y->s;
455
456 cleanup:
457 return ret;
458 }
459
mbedtls_mpi_mul_int(mbedtls_mpi * X,const mbedtls_mpi * A,mbedtls_mpi_uint b)460 int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint b )
461 {
462 mbedtls_mpi _B;
463 mbedtls_mpi_uint p[1];
464
465 _B.s = 1;
466 _B.n = 1;
467 _B.p = p;
468 p[0] = b;
469
470 return( mbedtls_mpi_mul_mpi( X, A, &_B ) );
471 }
472
473 /* Deal with the case when X & Y are too long for the hardware unit, by splitting one operand
474 into two halves.
475
476 Y must be the longer operand
477
478 Slice Y into Yp, Ypp such that:
479 Yp = lower 'b' bits of Y
480 Ypp = upper 'b' bits of Y (right shifted)
481
482 Such that
483 Z = X * Y
484 Z = X * (Yp + Ypp<<b)
485 Z = (X * Yp) + (X * Ypp<<b)
486
487 Note that this function may recurse multiple times, if both X & Y
488 are too long for the hardware multiplication unit.
489 */
mpi_mult_mpi_overlong(mbedtls_mpi * Z,const mbedtls_mpi * X,const mbedtls_mpi * Y,size_t y_words,size_t z_words)490 static int mpi_mult_mpi_overlong(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t y_words, size_t z_words)
491 {
492 int ret = 0;
493 mbedtls_mpi Ztemp;
494 /* Rather than slicing in two on bits we slice on limbs (32 bit words) */
495 const size_t words_slice = y_words / 2;
496 /* Yp holds lower bits of Y (declared to reuse Y's array contents to save on copying) */
497 const mbedtls_mpi Yp = {
498 .p = Y->p,
499 .n = words_slice,
500 .s = Y->s
501 };
502 /* Ypp holds upper bits of Y, right shifted (also reuses Y's array contents) */
503 const mbedtls_mpi Ypp = {
504 .p = Y->p + words_slice,
505 .n = y_words - words_slice,
506 .s = Y->s
507 };
508 mbedtls_mpi_init(&Ztemp);
509
510 /* Get result Ztemp = Yp * X (need temporary variable Ztemp) */
511 MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mpi(&Ztemp, X, &Yp) );
512
513 /* Z = Ypp * Y */
514 MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mpi(Z, X, &Ypp) );
515
516 /* Z = Z << b */
517 MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l(Z, words_slice * 32) );
518
519 /* Z += Ztemp */
520 MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi(Z, Z, &Ztemp) );
521
522 cleanup:
523 mbedtls_mpi_free(&Ztemp);
524
525 return ret;
526 }
527
528 /* Special-case of mbedtls_mpi_mult_mpi(), where we use hardware montgomery mod
529 multiplication to calculate an mbedtls_mpi_mult_mpi result where either
530 A or B are >2048 bits so can't use the standard multiplication method.
531
532 Result (number of words, based on A bits + B bits) must still be less than 4096 bits.
533
534 This case is simpler than the general case modulo multiply of
535 esp_mpi_mul_mpi_mod() because we can control the other arguments:
536
537 * Modulus is chosen with M=(2^num_bits - 1) (ie M=R-1), so output
538 * Mprime and Rinv are therefore predictable as follows:
539 isn't actually modulo anything.
540 Mprime 1
541 Rinv 1
542
543 (See RSA Accelerator section in Technical Reference for more about Mprime, Rinv)
544 */
545
mpi_mult_mpi_failover_mod_mult(mbedtls_mpi * Z,const mbedtls_mpi * X,const mbedtls_mpi * Y,size_t z_words)546 static int mpi_mult_mpi_failover_mod_mult( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t z_words)
547 {
548 int ret;
549 size_t hw_words = esp_mpi_hardware_words(z_words);
550
551 esp_mpi_enable_hardware_hw_op();
552
553 esp_mpi_mult_mpi_failover_mod_mult_hw_op(X, Y, hw_words );
554 MBEDTLS_MPI_CHK( mbedtls_mpi_grow(Z, hw_words) );
555 esp_mpi_read_result_hw_op(Z, hw_words);
556
557 Z->s = X->s * Y->s;
558 cleanup:
559 esp_mpi_disable_hardware_hw_op();
560 return ret;
561 }
562
563 #endif /* MBEDTLS_MPI_MUL_MPI_ALT */
564