/*
 * Copyright (c) 2023-2024, The TrustedFirmware-M Contributors. All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 */

#include "cc3xx_pka.h"

#include "cc3xx_dev.h"
#include "cc3xx_config.h"
#include "cc3xx_rng.h"
#include "cc3xx_endian_helpers.h"

#include <stdbool.h>
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define PKA_WORD_SIZE 8
#define PKA_WORD_BIT_SIZE (PKA_WORD_SIZE * 8)

#ifdef CC3XX_CONFIG_HW_VERSION_CC310
#define PKA_SRAM_SIZE 0x1000 /* 4KiB */
#else
#define PKA_SRAM_SIZE 0x1800 /* 6KiB */
#endif
/* The hardware requires an extra word and byte to deal with carries etc.
 * (which would then later be removed by a reduction operation). The TRM
 * suggests this should be only a word, but the extra byte is required for
 * mod_exp to function correctly.
 */
#define PKA_MAX_OVERFLOW_SIZE     (PKA_WORD_SIZE + 1)
#define PKA_MAX_OVERFLOW_BIT_SIZE (PKA_MAX_OVERFLOW_SIZE * 8)

/* Signed immediates use a two's complement encoding in 5 bits */
#define PKA_MAX_SIGNED_IMMEDIATE 15
#define PKA_MIN_SIGNED_IMMEDIATE (-16)

#define PKA_MAX_UNSIGNED_IMMEDIATE 31
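
/* Illustrative note (not from the TRM): in the 5-bit two's complement
 * encoding, -1 is encoded as 0b11111 and -16 as 0b10000, while unsigned
 * immediates use the same 5 bits to encode 0 to 31. So, for example:
 *
 *     cc3xx_lowlevel_pka_sub_si(r0, -16, res); // smallest legal signed imm
 *     cc3xx_lowlevel_pka_and_si(r0, 31, res);  // largest legal unsigned imm
 */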

#define PKA_PHYS_REG_TEMP_0 30
#define PKA_PHYS_REG_TEMP_1 31

#define CC3XX_PKA_REG_N_MASK 2

#define CC3XX_PKA_PHYS_REG_AMOUNT 32
#define PKA_RESERVED_PHYS_REG_AMOUNT 5
#define PKA_PHYS_REG_FIRST_MAPPABLE (CC3XX_PKA_REG_N_MASK + 1)
#define PKA_PHYS_REG_LAST_MAPPABLE  (PKA_PHYS_REG_TEMP_0 - 1)
#define PKA_VIRT_REG_FIRST_ALLOCATABLE (CC3XX_PKA_REG_N_MASK + 1)

#define CC3XX_PKA_RANDOM_BUF_SIZE 32

#ifdef CC3XX_CONFIG_PKA_INLINE_FOR_PERFORMANCE
#define CC3XX_ATTRIBUTE_INLINE inline __attribute__((always_inline))
#else
#define CC3XX_ATTRIBUTE_INLINE
#endif

enum pka_op_size_t {
    PKA_OP_SIZE_N = 0,
    PKA_OP_SIZE_REGISTER = 1,
};

/* Where an opcode claims it performs multiple operations, that is achieved by
 * using immediate or zero operands, not by any actual switching of the
 * operation being performed.
 */
enum cc3xx_pka_operation_t {
    CC3XX_PKA_OPCODE_TERMINATE = 0x0,
    CC3XX_PKA_OPCODE_ADD_INC = 0x4, /* INC is add immediate */
    CC3XX_PKA_OPCODE_SUB_DEC_NEG = 0x5, /* DEC is subtract immediate */
    CC3XX_PKA_OPCODE_MODADD_MODINC = 0x6,
    CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG = 0x7,
    CC3XX_PKA_OPCODE_AND_TST0_CLR0 = 0x8,
    CC3XX_PKA_OPCODE_OR_COPY_SET0 = 0x9,
    CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE = 0xA,
    CC3XX_PKA_OPCODE_SHR0 = 0xC,
    CC3XX_PKA_OPCODE_SHR1 = 0xD,
    CC3XX_PKA_OPCODE_SHL0 = 0xE,
    CC3XX_PKA_OPCODE_SHL1 = 0xF,
    CC3XX_PKA_OPCODE_MULLOW = 0x10,
    CC3XX_PKA_OPCODE_MODMUL = 0x11,
    CC3XX_PKA_OPCODE_MODMULN = 0x12,
    CC3XX_PKA_OPCODE_MODEXP = 0x13,
    CC3XX_PKA_OPCODE_DIV = 0x14,
    /* Opcodes below here are not documented in the TRM. */
    CC3XX_PKA_OPCODE_MODINV = 0x15,
    CC3XX_PKA_OPCODE_MODDIV = 0x16,
    CC3XX_PKA_OPCODE_MULHIGH = 0x17U,
    CC3XX_PKA_OPCODE_MODMLAC = 0x18U,
    CC3XX_PKA_OPCODE_MODMLACNR = 0x19U,
    CC3XX_PKA_OPCODE_SEPINT = 0x1AU,
    CC3XX_PKA_OPCODE_REDUCTION = 0x1BU,
};

/* It may seem strange that the externally visible state is so small, while
 * things like the virtual register allocations are internal to the
 * implementation and therefore not saved in a get_state/set_state operation.
 * In reality, recalculating the SRAM addresses is fast, and saving them has
 * downsides related to the temporary register SRAM address swapping, so this
 * is a reasonable approach.
 */
static uint32_t pka_reg_am_max;
static uint32_t phys_reg_next_mapped;
static uint32_t virt_reg_sram_addr[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
#ifdef CC3XX_CONFIG_PKA_ALIGN_FOR_PERFORMANCE
static uint32_t virt_reg_in_use[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static uint32_t virt_reg_is_mapped[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static uint32_t virt_reg_needs_n_mask[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static uint32_t virt_reg_phys_reg[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static uint32_t phys_reg_mapping_list[CC3XX_PKA_PHYS_REG_AMOUNT];
#else
static bool virt_reg_in_use[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static bool virt_reg_is_mapped[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static bool virt_reg_needs_n_mask[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static uint8_t virt_reg_phys_reg[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
static cc3xx_pka_reg_id_t phys_reg_mapping_list[CC3XX_PKA_PHYS_REG_AMOUNT];
#endif /* CC3XX_CONFIG_PKA_ALIGN_FOR_PERFORMANCE */

static struct cc3xx_pka_state_t pka_state;

static inline uint32_t pka_addr_from_byte_addr(uint32_t offset)
{
    return offset / sizeof(uint32_t);
}

static inline uint32_t pad_to_pka_word_size(uint32_t byte_size)
{
    /* Round up to the nearest PKA word */
    return (((byte_size + PKA_WORD_SIZE - 1) / PKA_WORD_SIZE) * PKA_WORD_SIZE);
}
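
/* Worked example (illustrative): with PKA_WORD_SIZE == 8,
 * pad_to_pka_word_size(20) == 24 and pad_to_pka_word_size(24) == 24, i.e.
 * byte sizes are rounded up to the next whole multiple of the PKA word size.
 */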

void cc3xx_lowlevel_pka_unmap_physical_registers(void)
{
    uint32_t idx;
    cc3xx_pka_reg_id_t virt_reg;

    /* Wait for the pipeline to finish */
    while (!P_CC3XX->pka.pka_done) {}

    for (idx = PKA_PHYS_REG_FIRST_MAPPABLE; idx <= PKA_PHYS_REG_LAST_MAPPABLE; idx++) {
        virt_reg = phys_reg_mapping_list[idx];
        if (virt_reg != 0 && virt_reg_is_mapped[virt_reg]) {
            virt_reg_sram_addr[virt_reg] = P_CC3XX->pka.memory_map[idx];
            virt_reg_phys_reg[virt_reg] = 0;
            virt_reg_is_mapped[virt_reg] = false;
        }
    }

    memset(phys_reg_mapping_list, 0, sizeof(phys_reg_mapping_list));

    for (idx = 0; idx < PKA_PHYS_REG_FIRST_MAPPABLE; idx++) {
        phys_reg_mapping_list[idx] = idx;
    }

    phys_reg_next_mapped = PKA_PHYS_REG_FIRST_MAPPABLE;
}

static void pka_init_from_state(void)
{
    uint32_t idx;

    P_CC3XX->misc.pka_clk_enable = 1;
    P_CC3XX->pka.pka_sw_reset = 1;

    /* Wait for SW reset to complete before proceeding */
    while (!P_CC3XX->pka.pka_done) {}

    /* The TRM says that this register is a byte-size, but it is in fact a
     * bit-size.
     */
    P_CC3XX->pka.pka_l[PKA_OP_SIZE_REGISTER] = pka_state.reg_size * 8;

    assert((pka_state.reg_size & ((PKA_WORD_SIZE) - 1)) == 0);
    assert(pka_state.reg_size >= (PKA_WORD_SIZE));

    pka_reg_am_max = (PKA_SRAM_SIZE) / pka_state.reg_size;
    if (pka_reg_am_max >= CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT + 2) {
        pka_reg_am_max = CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT + 2;
    }

    /* We need to allocate 4 special registers (and have at least 1 left for an
     * operation).
     */
    assert(pka_reg_am_max > 4);

    /* Unmap all the physical registers */
    cc3xx_lowlevel_pka_unmap_physical_registers();

    /* Set up the first three regions as N, Np and N_mask. These are
     * special, so map them now.
     */
    for (idx = 0; idx < PKA_PHYS_REG_FIRST_MAPPABLE; idx++) {
        virt_reg_is_mapped[idx] = true;
        virt_reg_phys_reg[idx] = idx;
        P_CC3XX->pka.memory_map[idx] =
            pka_addr_from_byte_addr(pka_state.reg_size * idx);
        virt_reg_sram_addr[idx] =
            pka_addr_from_byte_addr(pka_state.reg_size * idx);
    }

    /* Then reserve all but two regions for the general purpose registers */
    for (; idx < pka_reg_am_max - 2; idx++) {
        virt_reg_sram_addr[idx] =
            pka_addr_from_byte_addr(pka_state.reg_size * idx);
        virt_reg_is_mapped[idx] = 0;
        virt_reg_phys_reg[idx] = 0;
    }

    P_CC3XX->pka.memory_map[PKA_PHYS_REG_TEMP_0] =
        pka_addr_from_byte_addr(pka_state.reg_size * idx);

    idx++;

    P_CC3XX->pka.memory_map[PKA_PHYS_REG_TEMP_1] =
        pka_addr_from_byte_addr(pka_state.reg_size * idx);

    /* We don't count the temporary registers in pka_reg_am_max, since it is
     * used for validating function parameters, and the temporary registers
     * should never be passed as parameters.
     */
    pka_reg_am_max -= 2;
}

void cc3xx_lowlevel_pka_init(uint32_t size)
{
    cc3xx_lowlevel_pka_uninit();

    /* Minimum size is 16 bytes (128 bits), but just transparently increase it
     * if needed
     */
    if (size < 16) {
        size = 16;
    }

    /* Max size of an operation is 256 bytes (2048 bits). The actual max size
     * is 2112 bits, but 64 bits of overflow are required.
     */
    assert(size <= 256);

    /* Calculate the register size based on the requested operation size + the
     * size by which operations can overflow
     */
    pka_state.reg_size = pad_to_pka_word_size(size + PKA_MAX_OVERFLOW_SIZE);
    pka_state.virt_reg_next_mapped = PKA_VIRT_REG_FIRST_ALLOCATABLE;

    pka_init_from_state();
}
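
/* A minimal usage sketch (illustrative only; `data` is an assumed caller
 * buffer and error handling is elided). To perform 256-bit (32-byte)
 * operations:
 *
 *     cc3xx_lowlevel_pka_init(32);
 *     cc3xx_pka_reg_id_t a = cc3xx_lowlevel_pka_allocate_reg();
 *     cc3xx_lowlevel_pka_write_reg(a, data, 32);
 *     ...
 *     cc3xx_lowlevel_pka_free_reg(a);
 *     cc3xx_lowlevel_pka_uninit();
 *
 * Internally this selects a register size of 32 + PKA_MAX_OVERFLOW_SIZE = 41
 * bytes, padded up to 48 bytes (a whole number of PKA words).
 */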

static void allocate_phys_reg(cc3xx_pka_reg_id_t virt_reg)
{
    uint32_t phys_reg;

    assert(phys_reg_next_mapped <= PKA_PHYS_REG_LAST_MAPPABLE);
    assert(phys_reg_mapping_list[PKA_PHYS_REG_TEMP_0] == 0);
    assert(phys_reg_mapping_list[PKA_PHYS_REG_TEMP_1] == 0);

    phys_reg = phys_reg_next_mapped;
    phys_reg_next_mapped += 1;

    while (!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.memory_map[phys_reg] = virt_reg_sram_addr[virt_reg];
    while (!P_CC3XX->pka.pka_done) {}

    phys_reg_mapping_list[phys_reg] = virt_reg;
    virt_reg_is_mapped[virt_reg] = true;
    virt_reg_phys_reg[virt_reg] = phys_reg;
}

cc3xx_pka_reg_id_t cc3xx_lowlevel_pka_allocate_reg(void)
{
    cc3xx_pka_reg_id_t reg_id = 0;

    reg_id = pka_state.virt_reg_next_mapped;
    assert(reg_id != pka_reg_am_max);

    pka_state.virt_reg_next_mapped += 1;

    virt_reg_in_use[reg_id] = true;

    return reg_id;
}

/* To make this faster, it's only possible to free the most recently allocated
 * register. Register freeing must match this pattern.
 */
void cc3xx_lowlevel_pka_free_reg(cc3xx_pka_reg_id_t reg_id)
{
    assert(reg_id == pka_state.virt_reg_next_mapped - 1);
    assert(virt_reg_in_use[reg_id]);

    pka_state.virt_reg_next_mapped -= 1;

    virt_reg_in_use[reg_id] = false;
}
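
/* Since only the most recently allocated register may be freed, allocation
 * and freeing must nest like a stack. An illustrative legal sequence:
 *
 *     cc3xx_pka_reg_id_t a = cc3xx_lowlevel_pka_allocate_reg();
 *     cc3xx_pka_reg_id_t b = cc3xx_lowlevel_pka_allocate_reg();
 *     cc3xx_lowlevel_pka_free_reg(b); // b must be freed before a
 *     cc3xx_lowlevel_pka_free_reg(a);
 *
 * Freeing a before b would trip the asserts above.
 */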

static void CC3XX_ATTRIBUTE_INLINE ensure_virt_reg_is_mapped(cc3xx_pka_reg_id_t reg_id)
{
    assert(reg_id < pka_reg_am_max);

    if (!virt_reg_is_mapped[reg_id]) {
        allocate_phys_reg(reg_id);
    }
}

static void pka_write_reg(cc3xx_pka_reg_id_t reg_id, const uint32_t *data,
                          size_t len, bool swap_endian)
{
    size_t idx;

    /* Check alignment */
    assert(((uintptr_t)data & (sizeof(uint32_t) - 1)) == 0);
    /* Check length */
    assert((len & (sizeof(uint32_t) - 1)) == 0);

    /* Check slot */
    assert(reg_id < pka_reg_am_max);
    assert(virt_reg_in_use[reg_id]);
    assert(len <= pka_state.reg_size);

    /* Clear the register, so we don't have to explicitly write the upper
     * words
     */
    cc3xx_lowlevel_pka_clear(reg_id);

    /* Make sure we have a physical register mapped for the virtual register */
    ensure_virt_reg_is_mapped(reg_id);

    /* Wait for any outstanding operations to finish before performing reads or
     * writes on the PKA SRAM
     */
    while (!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.pka_sram_addr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[reg_id]];
    while (!P_CC3XX->pka.pka_done) {}

    /* Write data */
    for (idx = 0; idx < len / sizeof(uint32_t); idx++) {
        P_CC3XX->pka.pka_sram_wdata =
            swap_endian ? bswap_32(data[(len / sizeof(uint32_t) - 1) - idx])
                        : data[idx];
        while (!P_CC3XX->pka.pka_done) {}
    }
}

void cc3xx_lowlevel_pka_write_reg_swap_endian(cc3xx_pka_reg_id_t reg_id, const uint32_t *data,
                                              size_t len)
{
    pka_write_reg(reg_id, data, len, true);
}

void cc3xx_lowlevel_pka_write_reg(cc3xx_pka_reg_id_t reg_id, const uint32_t *data, size_t len)
{
    pka_write_reg(reg_id, data, len, false);
}
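
/* Illustrative sketch: big numbers in wire formats (e.g. RSA or ECC keys) are
 * usually big-endian, while the PKA SRAM is organised as little-endian words,
 * so a caller with a 4-byte-aligned 32-byte big-endian buffer (a hypothetical
 * `be_buf`) would use the byte-swapping variant:
 *
 *     cc3xx_lowlevel_pka_write_reg_swap_endian(reg, (uint32_t *)be_buf, 32);
 *
 * Per the asserts in pka_write_reg(), the buffer must be 4-byte aligned and a
 * multiple of 4 bytes long.
 */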

static void pka_read_reg(cc3xx_pka_reg_id_t reg_id, uint32_t *data, size_t len,
                         bool swap_endian)
{
    size_t idx;

    /* Check alignment */
    assert(((uintptr_t)data & (sizeof(uint32_t) - 1)) == 0);
    /* Check length */
    assert((len & (sizeof(uint32_t) - 1)) == 0);

    /* Check slot */
    assert(reg_id < pka_reg_am_max);
    assert(virt_reg_in_use[reg_id]);
    assert(len <= pka_state.reg_size);

    /* Make sure we have a physical register mapped for the virtual register */
    ensure_virt_reg_is_mapped(reg_id);

    /* The PKA registers can be remapped by the hardware (by swapping the
     * values of the memory_map registers), so we need to read the memory_map
     * register to find the correct address.
     */
    while (!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.pka_sram_raddr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[reg_id]];
    while (!P_CC3XX->pka.pka_done) {}

    /* Read data */
    for (idx = 0; idx < len / sizeof(uint32_t); idx++) {
        if (swap_endian) {
            data[(len / sizeof(uint32_t) - 1) - idx] = bswap_32(P_CC3XX->pka.pka_sram_rdata);
        } else {
            data[idx] = P_CC3XX->pka.pka_sram_rdata;
        }
    }
}

void cc3xx_lowlevel_pka_read_reg(cc3xx_pka_reg_id_t reg_id, uint32_t *data, size_t len)
{
    pka_read_reg(reg_id, data, len, false);
}

void cc3xx_lowlevel_pka_read_reg_swap_endian(cc3xx_pka_reg_id_t reg_id, uint32_t *data, size_t len)
{
    pka_read_reg(reg_id, data, len, true);
}

/* Calculate the Barrett Tag (https://en.wikipedia.org/wiki/Barrett_reduction)
 * to enable reduction modulo N. If this tag is not calculated, reduction
 * operations will fail. doi:10.1007/3-540-47721-7_24 is a good reference.
 *
 * We are attempting to calculate 2^k / N. In the reference the value k = 2 * n
 * (where n is the bit-length of N) is chosen because the maximum value to be
 * reduced is representable in 2 * n bits. In the previous driver, instead
 * k = n + 64 (where 64 is the PKA word size), which means the maximum value to
 * be reduced must be representable in n + 64 bits. It is assumed, but not
 * certain, that this holds because of how the reduction is calculated in
 * hardware.
 */
static inline void calc_Np(void)
{
    cc3xx_pka_reg_id_t reg_temp_0 = cc3xx_lowlevel_pka_allocate_reg();
    cc3xx_pka_reg_id_t reg_temp_1 = cc3xx_lowlevel_pka_allocate_reg();
    uint32_t N_bit_size = cc3xx_lowlevel_pka_get_bit_size(CC3XX_PKA_REG_N);
    uint32_t power;

    /* If N is large, we perform a special-case operation to avoid having to
     * synthesize the full constant 2^(N_bit_size + PKA_MAX_OVERFLOW_BIT_SIZE
     * - 1), which may be large. In this case, we first divide N by
     * 2^(N_bit_size - 2 * PKA_MAX_OVERFLOW_BIT_SIZE) and then divide the
     * constant 2^(3 * PKA_MAX_OVERFLOW_BIT_SIZE - 1) by the result, meaning
     * the largest number we need to synthesize in a register is
     * 2^(3 * PKA_MAX_OVERFLOW_BIT_SIZE - 1). This is done so that if the
     * modulus size is the maximum 2048 bits, then the largest synthesized
     * number fits into the 2112-bit register+overflow size.
     */
    if (N_bit_size > PKA_MAX_OVERFLOW_BIT_SIZE * 2) {
        power = PKA_MAX_OVERFLOW_BIT_SIZE * 3 - 1;
        cc3xx_lowlevel_pka_set_to_power_of_two(reg_temp_0, power);

        /* Divide N by 2^(N_bit_size - 2 * PKA_MAX_OVERFLOW_BIT_SIZE) */
        power = N_bit_size - 2 * PKA_MAX_OVERFLOW_BIT_SIZE;
        cc3xx_lowlevel_pka_shift_right_fill_0_ui(CC3XX_PKA_REG_N, power, reg_temp_1);

        /* Ceiling */
        cc3xx_lowlevel_pka_add_si(reg_temp_1, 1, reg_temp_1);
        cc3xx_lowlevel_pka_div(reg_temp_0, reg_temp_1, CC3XX_PKA_REG_NP, reg_temp_1);
    } else {
        /* Set reg_temp_0 to 2^(N_bit_size + PKA_MAX_OVERFLOW_BIT_SIZE - 1) */
        power = N_bit_size + PKA_MAX_OVERFLOW_BIT_SIZE - 1;
        cc3xx_lowlevel_pka_set_to_power_of_two(reg_temp_0, power);

        /* Finally, perform the division */
        cc3xx_lowlevel_pka_div(reg_temp_0, CC3XX_PKA_REG_N, CC3XX_PKA_REG_NP, reg_temp_1);
    }

    cc3xx_lowlevel_pka_free_reg(reg_temp_1);
    cc3xx_lowlevel_pka_free_reg(reg_temp_0);
}
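
/* Worked example (illustrative): for a modulus N of bit-length n, calc_Np()
 * computes
 *
 *     Np = floor(2^(n + PKA_MAX_OVERFLOW_BIT_SIZE - 1) / N)
 *
 * With PKA_MAX_OVERFLOW_BIT_SIZE == 72, a 4-bit modulus N = 13 gives
 * Np = floor(2^75 / 13). The large-N branch computes the same quotient
 * without ever synthesizing 2^(n + 71) in a register.
 */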

void cc3xx_lowlevel_pka_set_modulus(cc3xx_pka_reg_id_t modulus, bool calculate_tag,
                                    cc3xx_pka_reg_id_t barrett_tag)
{
    uint32_t N_bit_size;

    assert(modulus < pka_reg_am_max);
    assert(virt_reg_in_use[modulus]);

    virt_reg_in_use[CC3XX_PKA_REG_N] = true;
    cc3xx_lowlevel_pka_copy(modulus, CC3XX_PKA_REG_N);

    /* This operation size must correspond exactly to the bit-size of the
     * modulus, so a bit-counting operation is performed.
     */
    N_bit_size = cc3xx_lowlevel_pka_get_bit_size(CC3XX_PKA_REG_N);
    P_CC3XX->pka.pka_l[PKA_OP_SIZE_N] = N_bit_size;

    virt_reg_in_use[CC3XX_PKA_REG_N_MASK] = true;
    cc3xx_lowlevel_pka_set_to_power_of_two(CC3XX_PKA_REG_N_MASK, N_bit_size);
    cc3xx_lowlevel_pka_sub_si(CC3XX_PKA_REG_N_MASK, 1, CC3XX_PKA_REG_N_MASK);

#ifndef CC3XX_CONFIG_PKA_CALC_NP_ENABLE
    assert(!calculate_tag);
#endif /* !CC3XX_CONFIG_PKA_CALC_NP_ENABLE */

    virt_reg_in_use[CC3XX_PKA_REG_NP] = true;
    if (calculate_tag) {
#ifdef CC3XX_CONFIG_PKA_CALC_NP_ENABLE
        calc_Np();
#endif /* CC3XX_CONFIG_PKA_CALC_NP_ENABLE */
    } else {
        assert(barrett_tag < pka_reg_am_max);
        assert(virt_reg_in_use[barrett_tag]);

        cc3xx_lowlevel_pka_copy(barrett_tag, CC3XX_PKA_REG_NP);
    }
}
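
/* Illustrative sketch of modular arithmetic setup (modulus_words, modulus_len
 * and the registers a, b and res are assumed to be set up by the caller):
 *
 *     cc3xx_pka_reg_id_t n = cc3xx_lowlevel_pka_allocate_reg();
 *     cc3xx_lowlevel_pka_write_reg(n, modulus_words, modulus_len);
 *     cc3xx_lowlevel_pka_set_modulus(n, true, 0); // derive Np internally
 *     cc3xx_lowlevel_pka_mod_mul(a, b, res);      // now reduces modulo N
 *
 * When calculate_tag is true, the barrett_tag argument is ignored.
 */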

void cc3xx_lowlevel_pka_get_state(struct cc3xx_pka_state_t *state, uint32_t save_reg_am,
                                  cc3xx_pka_reg_id_t *save_reg_list,
                                  uint32_t **save_reg_ptr_list,
                                  const size_t *save_reg_size_list)
{
    size_t idx;
    cc3xx_pka_reg_id_t reg_id;

    memcpy(state, &pka_state, sizeof(*state));

    for (idx = 0; idx < save_reg_am; idx++) {
        reg_id = save_reg_list[idx];
        assert(reg_id < pka_reg_am_max);
        assert(virt_reg_in_use[reg_id]);

        cc3xx_lowlevel_pka_read_reg(reg_id, save_reg_ptr_list[idx], save_reg_size_list[idx]);
    }
}

void cc3xx_lowlevel_pka_set_state(const struct cc3xx_pka_state_t *state,
                                  uint32_t load_reg_am, cc3xx_pka_reg_id_t *load_reg_list,
                                  const uint32_t **load_reg_ptr_list,
                                  const size_t *load_reg_size_list)
{
    size_t idx;
    cc3xx_pka_reg_id_t reg_id;

    memcpy(&pka_state, state, sizeof(*state));

    pka_init_from_state();

    for (idx = 0; idx < load_reg_am; idx++) {
        reg_id = load_reg_list[idx];
        assert(reg_id < pka_reg_am_max);
        assert(virt_reg_in_use[reg_id]);

        cc3xx_lowlevel_pka_write_reg(reg_id, load_reg_ptr_list[idx], load_reg_size_list[idx]);
    }
}

void cc3xx_lowlevel_pka_uninit(void)
{
    memset(&pka_state, 0, sizeof(pka_state));
    memset(virt_reg_in_use, 0, sizeof(virt_reg_in_use));
    memset(virt_reg_is_mapped, 0, sizeof(virt_reg_is_mapped));
    memset(virt_reg_phys_reg, 0, sizeof(virt_reg_phys_reg));
    memset(virt_reg_sram_addr, 0, sizeof(virt_reg_sram_addr));
    memset(virt_reg_needs_n_mask, 0, sizeof(virt_reg_needs_n_mask));
    memset(phys_reg_mapping_list, 0, sizeof(phys_reg_mapping_list));
    phys_reg_next_mapped = 0;

    P_CC3XX->misc.pka_clk_enable = 0;
}

static uint32_t CC3XX_ATTRIBUTE_INLINE opcode_construct(enum cc3xx_pka_operation_t op,
                                                        enum pka_op_size_t size,
                                                        bool r0_is_immediate, uint32_t r0,
                                                        bool r1_is_immediate, uint32_t r1,
                                                        bool discard_result, uint32_t res)
{
    uint32_t opcode = 0;

    /* The tag part of the opcode register is designed to be used to debug PKA
     * operations, but we don't use this functionality. For some of the opcodes
     * that aren't documented in the TRM, it is used as a third register
     * input.
     */
    /* opcode |= r3 & 0b11111; */

    /* The top bit of the output register select is a field which, if set,
     * prevents the operation from writing the output register (or more
     * accurately, prevents the swapping of the virtual address of the output
     * register and the temporary register). The pka_status register is still
     * set, so flags such as the sign of the result can still be used.
     */
    if (!discard_result) {
        assert(res < pka_reg_am_max);
        assert(virt_reg_in_use[res]);
        /* Make sure we have a physical register mapped for the virtual register */
        ensure_virt_reg_is_mapped(res);
        opcode |= (virt_reg_phys_reg[res] & 0b11111) << 6;
    } else {
        opcode |= (discard_result & 0b1) << 11;
    }

    /* The top bit of the REG_A field toggles between a register ID and an
     * immediate, and the lower 5 bits give us either a register ID from 0 to
     * 31, a signed immediate from -16 to 15, or an unsigned immediate from 0
     * to 31, depending on the operation.
     */
    if (r1_is_immediate) {
        opcode |= (r1_is_immediate & 0b1) << 17;
        opcode |= (r1 & 0b11111) << 12;
    } else {
        assert(r1 < pka_reg_am_max);
        assert(virt_reg_in_use[r1]);
        /* Make sure we have a physical register mapped for the virtual register */
        ensure_virt_reg_is_mapped(r1);
        opcode |= (virt_reg_phys_reg[r1] & 0b11111) << 12;
    }

    /* For unclear reasons, the immediate (shift amount) for shift opcodes
     * doesn't use the upper bit to denote that it isn't a register.
     * Possibly because these opcodes don't support register input.
     */
    if (op >= CC3XX_PKA_OPCODE_SHR0 && op <= CC3XX_PKA_OPCODE_SHL1) {
        opcode &= ~(0b1 << 17);
    }

    /* The top bit of the REG_B field toggles between a register ID and an
     * immediate, in the same encoding as REG_A.
     */
    if (r0_is_immediate) {
        opcode |= (r0_is_immediate & 0b1) << 23;
        opcode |= (r0 & 0b11111) << 18;
    } else {
        assert(r0 < pka_reg_am_max);
        assert(virt_reg_in_use[r0]);
        /* Make sure we have a physical register mapped for the virtual register */
        ensure_virt_reg_is_mapped(r0);
        opcode |= (virt_reg_phys_reg[r0] & 0b11111) << 18;
    }

    if (!r0_is_immediate) {
        assert(virt_reg_is_mapped[r0]);
    }

    if (!r1_is_immediate) {
        assert(virt_reg_is_mapped[r1]);
        if (!r0_is_immediate && r0 != r1) {
            assert(virt_reg_phys_reg[r1] != virt_reg_phys_reg[r0]);
        }
    }

    if (!discard_result) {
        assert(virt_reg_is_mapped[res]);
        if (!r0_is_immediate && r0 != res) {
            assert(virt_reg_phys_reg[res] != virt_reg_phys_reg[r0]);
        }
        if (!r1_is_immediate && r1 != res) {
            assert(virt_reg_phys_reg[res] != virt_reg_phys_reg[r1]);
        }
    }

    /* Select which of the pka_l registers is used for the bit-length of the
     * operation.
     */
    opcode |= (size & 0b111) << 24;

    /* Set the actual operation */
    opcode |= (op & 0b11111) << 27;

    /* Wait for a pipeline slot to be free before submitting this operation.
     * Note that previous operations may still be in progress at this point.
     */
    while (!P_CC3XX->pka.pka_pipe_rdy) {}

    return opcode;
}
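
/* For reference, the opcode word assembled above is laid out as follows (as
 * implied by the shifts in opcode_construct(); illustrative summary, not
 * taken from the TRM):
 *
 *     [31:27] operation        [26:24] pka_l length-register select
 *     [23]    REG_B is imm     [22:18] REG_B register id / immediate
 *     [17]    REG_A is imm     [16:12] REG_A register id / immediate
 *     [11]    discard result   [10:6]  result register id
 *     [4:0]   tag (unused here; third register for undocumented opcodes)
 */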

uint32_t cc3xx_lowlevel_pka_get_bit_size(cc3xx_pka_reg_id_t r0)
{
    int32_t idx;
    uint32_t word;

    ensure_virt_reg_is_mapped(r0);

    /* This isn't an operation that can use the PKA pipeline, so we need to
     * wait for the pipeline to be finished before reading the SRAM.
     */
    while (!P_CC3XX->pka.pka_done) {}

    for (idx = pka_state.reg_size - sizeof(uint32_t); idx >= 0;
         idx -= sizeof(uint32_t)) {
        P_CC3XX->pka.pka_sram_raddr =
            P_CC3XX->pka.memory_map[virt_reg_phys_reg[r0]] +
            pka_addr_from_byte_addr(idx);
        while (!P_CC3XX->pka.pka_done) {}

        word = P_CC3XX->pka.pka_sram_rdata;

        if (word) {
            break;
        }
    }

    if (idx < 0) {
        return 0;
    } else {
        return idx * 8 + (32 - __builtin_clz(word));
    }
}
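
/* Worked example (illustrative): a register holding 2^32 has its highest set
 * bit in the second 32-bit word, so the loop stops at byte offset idx == 4
 * with word == 0x1, giving 4 * 8 + (32 - 31) == 33 bits, as expected.
 */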

void cc3xx_lowlevel_pka_set_to_power_of_two(cc3xx_pka_reg_id_t r0, uint32_t power)
{
    uint32_t final_word = 1u << (power % (sizeof(uint32_t) * 8));
    uint32_t word_offset = power / (8 * sizeof(uint32_t));

    cc3xx_lowlevel_pka_clear(r0);

    ensure_virt_reg_is_mapped(r0);

    /* This isn't an operation that can use the PKA pipeline, so we need to
     * wait for the pipeline to be finished before writing the SRAM.
     */
    while (!P_CC3XX->pka.pka_done) {}

    P_CC3XX->pka.pka_sram_addr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[r0]] + word_offset;
    while (!P_CC3XX->pka.pka_done) {}

    P_CC3XX->pka.pka_sram_wdata = final_word;
    while (!P_CC3XX->pka.pka_done) {}
}

#ifdef CC3XX_CONFIG_RNG_ENABLE
cc3xx_err_t cc3xx_lowlevel_pka_set_to_random(cc3xx_pka_reg_id_t r0, size_t bit_len)
{
    uint32_t byte_size = (bit_len + 7) / 8;
    uint32_t word_size = (byte_size + 3) / sizeof(uint32_t);
    uint32_t random_buf[word_size];
    cc3xx_err_t err;

    err = cc3xx_lowlevel_rng_get_random((uint8_t *)random_buf, word_size * sizeof(uint32_t));
    if (err != CC3XX_ERR_SUCCESS) {
        return err;
    }

    /* Mask off any extra bits in the top word (when bit_len is a multiple of
     * 32 there are none, and shifting by 32 would be undefined behaviour).
     */
    if (bit_len % 32 != 0) {
        random_buf[word_size - 1] >>= 32 - (bit_len % 32);
    }

    cc3xx_lowlevel_pka_write_reg(r0, random_buf, sizeof(random_buf));

    return CC3XX_ERR_SUCCESS;
}

cc3xx_err_t cc3xx_lowlevel_pka_set_to_random_within_modulus(cc3xx_pka_reg_id_t r0)
{
    cc3xx_err_t err;

    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    do {
        /* This uses the simple discard method from SP800-90A, because the
         * modular methods are impractical due to the pka_reduce function not
         * working for numbers significantly greater than OP_SIZE_N.
         */
        err = cc3xx_lowlevel_pka_set_to_random(r0, P_CC3XX->pka.pka_l[PKA_OP_SIZE_N]);
        if (err != CC3XX_ERR_SUCCESS) {
            return err;
        }
    } while (!cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    return CC3XX_ERR_SUCCESS;
}
#endif /* CC3XX_CONFIG_RNG_ENABLE */

void cc3xx_lowlevel_pka_add(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_ADD_INC,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_add_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_ADD_INC,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, false, res);
}

void cc3xx_lowlevel_pka_sub(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_sub_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, false, res);
}

void cc3xx_lowlevel_pka_neg(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG,
                                           PKA_OP_SIZE_REGISTER,
                                           true, 0, false, r0, false, res);
}

void cc3xx_lowlevel_pka_mod_add(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODADD_MODINC,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_mod_add_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_greater_than_si(CC3XX_PKA_REG_N, imm));

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODADD_MODINC,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, false, res);
}

void cc3xx_lowlevel_pka_mod_sub(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_mod_sub_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_greater_than_si(CC3XX_PKA_REG_N, imm));

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, false, res);
}

void cc3xx_lowlevel_pka_mod_neg(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG,
                                           PKA_OP_SIZE_REGISTER,
                                           true, 0, false, r0, false, res);
}

void cc3xx_lowlevel_pka_and(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_and_si(cc3xx_pka_reg_id_t r0, uint32_t mask, cc3xx_pka_reg_id_t res)
{
    assert(mask <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, mask, false, res);
}

uint32_t cc3xx_lowlevel_pka_test_bits_ui(cc3xx_pka_reg_id_t r0, uint32_t idx, uint32_t bit_am)
{
    uint32_t bits;
    uint32_t word_offset = idx / (8 * sizeof(uint32_t));

    assert(bit_am <= 4);
    /* This prevents us from needing to read two words */
    assert(idx % bit_am == 0);

    ensure_virt_reg_is_mapped(r0);

    while (!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.pka_sram_raddr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[r0]] + word_offset;
    while (!P_CC3XX->pka.pka_done) {}

    bits = (P_CC3XX->pka.pka_sram_rdata >> (idx % 32)) & ((1 << bit_am) - 1);
    while (!P_CC3XX->pka.pka_done) {}

    /* Return the bits read directly from the PKA SRAM */
    return bits;
}

void cc3xx_lowlevel_pka_clear_bit(cc3xx_pka_reg_id_t r0, uint32_t idx, cc3xx_pka_reg_id_t res)
{
    /* Check that we can construct the required mask */
    assert((0x1 << idx) <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, ~(1 << idx), false, res);
}

void cc3xx_lowlevel_pka_clear(cc3xx_pka_reg_id_t r0)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, 0, false, r0);
}

void cc3xx_lowlevel_pka_or(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_or_si(cc3xx_pka_reg_id_t r0, uint32_t mask, cc3xx_pka_reg_id_t res)
{
    assert(mask <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, mask, false, res);
}

void cc3xx_lowlevel_pka_copy(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, 0, false, res);
}

void cc3xx_lowlevel_pka_set_bit(cc3xx_pka_reg_id_t r0, uint32_t idx, cc3xx_pka_reg_id_t res)
{
    /* Check that we can construct the required mask */
    assert((0x1 << idx) <= PKA_MAX_UNSIGNED_IMMEDIATE);

    /* Setting a bit requires an OR; the AND opcode would instead clear every
     * other bit.
     */
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, 1 << idx, false, res);
}

void cc3xx_lowlevel_pka_xor(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_xor_si(cc3xx_pka_reg_id_t r0, uint32_t mask, cc3xx_pka_reg_id_t res)
{
    assert(mask <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, mask, false, res);
}

void cc3xx_lowlevel_pka_flip_bit(cc3xx_pka_reg_id_t r0, uint32_t idx, cc3xx_pka_reg_id_t res)
{
    /* Check that we can construct the required mask */
    assert((0x1 << idx) <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, 1 << idx, false, res);
}

bool cc3xx_lowlevel_pka_are_equal(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, true, 0);

    /* We need the pipeline to finish before we read the status register for
     * the result.
     */
    while (!P_CC3XX->pka.pka_done) {}

    /* Return ALU_OUT_ZERO */
    return P_CC3XX->pka.pka_status & (0b1 << 12);
}

bool cc3xx_lowlevel_pka_are_equal_si(cc3xx_pka_reg_id_t r0, int32_t imm)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, true, 0);

    /* We need the pipeline to finish before we read the status register for
     * the result.
     */
    while (!P_CC3XX->pka.pka_done) {}

    /* Return ALU_OUT_ZERO */
    return P_CC3XX->pka.pka_status & (0b1 << 12);
}

bool cc3xx_lowlevel_pka_less_than(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, true, 0);

    /* Wait for the pipeline to be finished before reading the pka status
     * register.
     */
    while (!P_CC3XX->pka.pka_done) {}

    /* Return the value of ALU_SIGN_OUT */
    return P_CC3XX->pka.pka_status & (0b1 << 8);
}

bool cc3xx_lowlevel_pka_less_than_si(cc3xx_pka_reg_id_t r0, int32_t imm)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, true, 0);

    /* Wait for the pipeline to be finished before reading the pka status
     * register.
     */
    while (!P_CC3XX->pka.pka_done) {}

    /* Return the value of ALU_SIGN_OUT */
    return P_CC3XX->pka.pka_status & (0b1 << 8);
}

bool cc3xx_lowlevel_pka_greater_than(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1)
{
    return !cc3xx_lowlevel_pka_less_than(r0, r1)
        && !cc3xx_lowlevel_pka_are_equal(r0, r1);
}

bool cc3xx_lowlevel_pka_greater_than_si(cc3xx_pka_reg_id_t r0, int32_t imm)
{
    return !cc3xx_lowlevel_pka_less_than_si(r0, imm)
        && !cc3xx_lowlevel_pka_are_equal_si(r0, imm);
}

void cc3xx_lowlevel_pka_shift_right_fill_0_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    uint32_t shift_am;

    /* The shift operation shifts by 1 more than the number requested, so for
     * the sake of sensible semantics we decrease the shift number by 1. A
     * requested shift of 0 therefore cannot be issued to the hardware, and is
     * implemented as a copy instead.
     */
    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        shift_am = shift <= (PKA_MAX_UNSIGNED_IMMEDIATE + 1) ? shift
                                                             : (PKA_MAX_UNSIGNED_IMMEDIATE + 1);

        P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SHR0,
                                               PKA_OP_SIZE_REGISTER,
                                               false, r0, true, shift_am - 1, false, res);
        shift -= shift_am;
        r0 = res;
    }
}
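
/* Worked example (illustrative): since the hardware immediate encodes shifts
 * of 1 to 32 (shift_am - 1 in 5 bits), a 100-bit shift is issued as four
 * pipelined operations of 32, 32, 32 and 4 bits respectively.
 */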

void cc3xx_lowlevel_pka_shift_right_fill_1_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    uint32_t shift_am;

    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        shift_am = shift <= (PKA_MAX_UNSIGNED_IMMEDIATE + 1) ? shift
                                                             : (PKA_MAX_UNSIGNED_IMMEDIATE + 1);

        P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SHR1,
                                               PKA_OP_SIZE_REGISTER,
                                               false, r0, true, shift_am - 1, false, res);
        shift -= shift_am;
        r0 = res;
    }
}

void cc3xx_lowlevel_pka_shift_left_fill_0_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    uint32_t shift_am;

    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        shift_am = shift <= (PKA_MAX_UNSIGNED_IMMEDIATE + 1) ? shift
                                                             : (PKA_MAX_UNSIGNED_IMMEDIATE + 1);

        P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SHL0,
                                               PKA_OP_SIZE_REGISTER,
                                               false, r0, true, shift_am - 1, false, res);
        shift -= shift_am;
        r0 = res;
    }
}

void cc3xx_lowlevel_pka_shift_left_fill_1_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    uint32_t shift_am;

    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        shift_am = shift <= (PKA_MAX_UNSIGNED_IMMEDIATE + 1) ? shift
                                                             : (PKA_MAX_UNSIGNED_IMMEDIATE + 1);

        P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SHL1,
                                               PKA_OP_SIZE_REGISTER,
                                               false, r0, true, shift_am - 1, false, res);
        shift -= shift_am;
        r0 = res;
    }
}

void cc3xx_lowlevel_pka_mul_low_half(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MULLOW,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_mul_high_half(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MULHIGH,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}

void cc3xx_lowlevel_pka_div(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t quotient,
                            cc3xx_pka_reg_id_t remainder)
{
    cc3xx_pka_reg_id_t temp_r0 = cc3xx_lowlevel_pka_allocate_reg();
    cc3xx_pka_reg_id_t temp_r1;

    /* Since the div operation uses r0 to store the remainder, and we want to
     * avoid clobbering input registers, perform a copy first.
     */
    cc3xx_lowlevel_pka_copy(r0, temp_r0);

    /* If r1 is also the quotient register, this produces no result. In this
     * case, copy to a temporary register.
     */
    if (r1 == quotient) {
        temp_r1 = cc3xx_lowlevel_pka_allocate_reg();
        cc3xx_lowlevel_pka_copy(r1, temp_r1);
    } else {
        temp_r1 = r1;
    }

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_DIV,
                                           PKA_OP_SIZE_REGISTER,
                                           false, temp_r0, false, temp_r1,
                                           false, quotient);

    /* The hardware left the remainder in temp_r0; copy it into (and thereby
     * clobber) the remainder output register.
     */
    cc3xx_lowlevel_pka_copy(temp_r0, remainder);

    if (temp_r1 != r1) {
        cc3xx_lowlevel_pka_free_reg(temp_r1);
    }
    cc3xx_lowlevel_pka_free_reg(temp_r0);
}
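
/* Illustrative sketch: computing q = a / b and r = a % b without clobbering
 * the inputs (a, b, q and r are assumed caller-allocated registers):
 *
 *     cc3xx_lowlevel_pka_div(a, b, q, r);
 *
 * The temporary copies above keep the operation correct even when q aliases
 * b, and leave a and b unmodified.
 */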

void cc3xx_lowlevel_pka_mod_mul(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    /* This operation uses PKA_OP_SIZE_N, instead of _REGISTER. This is not
     * because it performs reduction, since mod_add uses _REGISTER, but because
     * it uses the special-purpose modular multiplier instead of the ALU.
     */
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODMUL,
                                           PKA_OP_SIZE_N,
                                           false, r0, false, r1, false, res);

    /* Because this uses OP_SIZE_N, it sometimes leaves garbage bits in the
     * top words. Do a mask operation to clear them.
     */
    cc3xx_lowlevel_pka_and(res, CC3XX_PKA_REG_N_MASK, res);
}

void cc3xx_lowlevel_pka_mod_mul_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    cc3xx_pka_reg_id_t temp_reg = cc3xx_lowlevel_pka_allocate_reg();

    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    /* This operation doesn't work with negative numbers */
    assert(imm >= 0);

    /* Clear temp_reg and then add the immediate, which effectively sets
     * temp_reg to imm.
     */
    cc3xx_lowlevel_pka_clear(temp_reg);
    cc3xx_lowlevel_pka_add_si(temp_reg, imm, temp_reg);

    cc3xx_lowlevel_pka_mod_mul(r0, temp_reg, res);

    cc3xx_lowlevel_pka_free_reg(temp_reg);
}

void cc3xx_lowlevel_pka_mod_exp(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    /* This operation uses PKA_OP_SIZE_N, instead of _REGISTER. This is not
     * because it performs reduction, since mod_add uses _REGISTER, but because
     * it uses the special-purpose modular multiplier instead of the ALU.
     */
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_MODEXP,
                                           PKA_OP_SIZE_N,
                                           false, r0, false, r1, false, res);

    /* Because this uses OP_SIZE_N, it sometimes leaves garbage bits in the
     * top words. Do a mask operation to clear them.
     */
    cc3xx_lowlevel_pka_and(res, CC3XX_PKA_REG_N_MASK, res);
}

void cc3xx_lowlevel_pka_mod_exp_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    cc3xx_pka_reg_id_t temp_reg = cc3xx_lowlevel_pka_allocate_reg();

    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    /* This operation doesn't work with negative numbers */
    assert(imm >= 0);

    /* Clear temp_reg and then add the immediate, which effectively sets
     * temp_reg to imm.
     */
    cc3xx_lowlevel_pka_clear(temp_reg);
    cc3xx_lowlevel_pka_add_si(temp_reg, imm, temp_reg);

    cc3xx_lowlevel_pka_mod_exp(r0, temp_reg, res);

    cc3xx_lowlevel_pka_free_reg(temp_reg);
}

void cc3xx_lowlevel_pka_mod_inv(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    cc3xx_pka_reg_id_t n_minus_2 = cc3xx_lowlevel_pka_allocate_reg();

    /* Use the special case of Euler's theorem (Fermat's little theorem):
     * a^-1 = a^(N-2) mod N, which requires N to be prime.
     */
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    cc3xx_lowlevel_pka_sub_si(CC3XX_PKA_REG_N, 2, n_minus_2);
    cc3xx_lowlevel_pka_mod_exp(r0, n_minus_2, res);

    cc3xx_lowlevel_pka_free_reg(n_minus_2);
}
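
/* Worked example (illustrative): with the prime N = 7 set as the modulus,
 * inverting r0 = 3 computes 3^(7-2) mod 7 = 243 mod 7 = 5, and indeed
 * 3 * 5 mod 7 == 1. Note that this approach relies on N being prime.
 */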

void cc3xx_lowlevel_pka_reduce(cc3xx_pka_reg_id_t r0)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    /* This operation uses PKA_OP_SIZE_N, instead of _REGISTER. This is not
     * because it performs reduction, since mod_add uses _REGISTER, but because
     * it uses the special-purpose modular multiplier instead of the ALU.
     */
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_REDUCTION,
                                           PKA_OP_SIZE_N,
                                           false, r0, false, 0, false, r0);

    /* Because this uses OP_SIZE_N, it sometimes leaves garbage bits in the
     * top words. Do a mask operation to clear them.
     */
    cc3xx_lowlevel_pka_and(r0, CC3XX_PKA_REG_N_MASK, r0);
}