1 /*
2  * Copyright (c) 2023-2024, The TrustedFirmware-M Contributors. All rights reserved.
3  *
4  * SPDX-License-Identifier: BSD-3-Clause
5  *
6  */
7 
8 #include "cc3xx_pka.h"
9 
10 #include "cc3xx_dev.h"
11 #include "cc3xx_config.h"
12 #include "cc3xx_rng.h"
13 #include "cc3xx_endian_helpers.h"
14 
15 #include <stdbool.h>
16 #include <assert.h>
17 #include <stdint.h>
18 #include <string.h>
19 
20 #define PKA_WORD_SIZE  8
21 #define PKA_WORD_BIT_SIZE  (PKA_WORD_SIZE * 8)
22 
23 #ifdef CC3XX_CONFIG_HW_VERSION_CC310
24 #define PKA_SRAM_SIZE 0x1000 /* 4KiB */
25 #else
26 #define PKA_SRAM_SIZE 0x1800 /* 6KiB */
27 #endif
28 
/* The hardware requires an extra word and byte to deal with carries etc.
 * (which would then later be removed by a reduction operation). The TRM
 * suggests this should be only a word, but the extra byte is required for
 * mod_exp to function correctly.
 */
34 #define PKA_MAX_OVERFLOW_SIZE     (PKA_WORD_SIZE + 1)
35 #define PKA_MAX_OVERFLOW_BIT_SIZE (PKA_MAX_OVERFLOW_SIZE * 8)
36 
37 /* Signed immediates use a two's complement encoding in 5 bits */
38 #define PKA_MAX_SIGNED_IMMEDIATE 15
39 #define PKA_MIN_SIGNED_IMMEDIATE (-16)
40 
41 #define PKA_MAX_UNSIGNED_IMMEDIATE 31
42 
43 #define PKA_PHYS_REG_TEMP_0 30
44 #define PKA_PHYS_REG_TEMP_1 31
45 
46 #define CC3XX_PKA_REG_N_MASK 2
47 
48 #define CC3XX_PKA_PHYS_REG_AMOUNT 32
49 #define PKA_RESERVED_PHYS_REG_AMOUNT 5
50 #define PKA_PHYS_REG_FIRST_MAPPABLE (CC3XX_PKA_REG_N_MASK + 1)
51 #define PKA_PHYS_REG_LAST_MAPPABLE (PKA_PHYS_REG_TEMP_0 - 1)
52 #define PKA_VIRT_REG_FIRST_ALLOCATABLE (CC3XX_PKA_REG_N_MASK + 1)
53 
54 #define CC3XX_PKA_RANDOM_BUF_SIZE 32
55 
56 #ifdef CC3XX_CONFIG_PKA_INLINE_FOR_PERFORMANCE
57 #define CC3XX_ATTRIBUTE_INLINE inline __attribute__((always_inline))
58 #else
59 #define CC3XX_ATTRIBUTE_INLINE
60 #endif
61 
/* Selects which of the two pka_l length registers an operation uses for its
 * bit-length.
 */
enum pka_op_size_t {
    PKA_OP_SIZE_N = 0,        /* Bit-length of the modulus N */
    PKA_OP_SIZE_REGISTER = 1, /* Full register bit-length */
};
66 
/* Where an opcode claims it performs multiple operations, that is achieved by
 * using immediate or zero operands, not by any actual switching of the
 * operation being performed.
 */
enum cc3xx_pka_operation_t {
    CC3XX_PKA_OPCODE_TERMINATE = 0x0,
    CC3XX_PKA_OPCODE_ADD_INC = 0x4, /* INC is add immediate */
    CC3XX_PKA_OPCODE_SUB_DEC_NEG = 0x5, /* DEC is add immediate */
    CC3XX_PKA_OPCODE_MODADD_MODINC = 0x6,
    CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG = 0x7,
    CC3XX_PKA_OPCODE_AND_TST0_CLR0 = 0x8,
    CC3XX_PKA_OPCODE_OR_COPY_SET0 = 0x9,
    CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE = 0xA,
    CC3XX_PKA_OPCODE_SHR0 = 0xC, /* Shift right, filling with 0 */
    CC3XX_PKA_OPCODE_SHR1 = 0xD, /* Shift right, filling with 1 */
    CC3XX_PKA_OPCODE_SHL0 = 0xE, /* Shift left, filling with 0 */
    CC3XX_PKA_OPCODE_SHL1 = 0xF, /* Shift left, filling with 1 */
    CC3XX_PKA_OPCODE_MULLOW = 0x10,
    CC3XX_PKA_OPCODE_MODMUL = 0x11,
    CC3XX_PKA_OPCODE_MODMULN = 0x12,
    CC3XX_PKA_OPCODE_MODEXP = 0x13,
    CC3XX_PKA_OPCODE_DIV = 0x14,
    /* Opcodes below here are not documented in the TRM. */
    CC3XX_PKA_OPCODE_MODINV = 0x15,
    CC3XX_PKA_OPCODE_MODDIV = 0x16,
    CC3XX_PKA_OPCODE_MULHIGH = 0x17U,
    CC3XX_PKA_OPCODE_MODMLAC = 0x18U,
    CC3XX_PKA_OPCODE_MODMLACNR = 0x19U,
    CC3XX_PKA_OPCODE_SEPINT = 0x1AU,
    CC3XX_PKA_OPCODE_REDUCTION = 0x1BU,
};
98 
99 /* It seems strange that the state that is external is so small, while things
100  * like the virtual register allocations are internal to the implementation and
101  * therefore not saved in a get_state/set_state operation. In reality,
102  * recalculating the sram addresses is fast, and saving them has downsides
103  * related to the temporary register sram address swapping, so this is a
104  * reasonable approach.
105  */
106 static uint32_t pka_reg_am_max;
107 uint32_t phys_reg_next_mapped;
108 static uint32_t virt_reg_sram_addr[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
109 #ifdef CC3XX_CONFIG_PKA_ALIGN_FOR_PERFORMANCE
110 static uint32_t virt_reg_in_use[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
111 static uint32_t virt_reg_is_mapped[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
112 static uint32_t virt_reg_needs_n_mask[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
113 static uint32_t virt_reg_phys_reg[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
114 static uint32_t phys_reg_mapping_list[CC3XX_PKA_PHYS_REG_AMOUNT];
115 #else
116 static bool virt_reg_in_use[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
117 static bool virt_reg_is_mapped[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
118 static bool virt_reg_needs_n_mask[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
119 static uint8_t virt_reg_phys_reg[CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT];
120 static cc3xx_pka_reg_id_t phys_reg_mapping_list[CC3XX_PKA_PHYS_REG_AMOUNT];
121 #endif /* CC3XX_CONFIG_PKA_ALIGN_FOR_PERFORMANCE */
122 
123 static struct cc3xx_pka_state_t pka_state;
124 
/* Convert a byte offset within the PKA SRAM into a 32-bit-word address */
static inline uint32_t pka_addr_from_byte_addr(uint32_t offset)
{
    return offset >> 2;
}
129 
pad_to_pka_word_size(uint32_t byte_size)130 static inline uint32_t pad_to_pka_word_size(uint32_t byte_size)
131 {
132     /* round up to the nearest PKA word */
133     return (((byte_size + PKA_WORD_SIZE - 1) / PKA_WORD_SIZE) * PKA_WORD_SIZE);
134 }
135 
/* Unmap every mappable physical register from its virtual register.
 *
 * The hardware can swap SRAM addresses between registers (result/temporary
 * swapping), so each virtual register's current address is read back out of
 * the hardware memory map before the mapping is dropped.
 */
void cc3xx_lowlevel_pka_unmap_physical_registers(void)
{
    uint32_t idx;
    cc3xx_pka_reg_id_t virt_reg;

    /* Wait for the pipeline to finish */
    while(!P_CC3XX->pka.pka_done){}

    for (idx = PKA_PHYS_REG_FIRST_MAPPABLE; idx <= PKA_PHYS_REG_LAST_MAPPABLE; idx++) {
        virt_reg = phys_reg_mapping_list[idx];
        if (virt_reg != 0 && virt_reg_is_mapped[virt_reg]) {
            /* Capture the (possibly hardware-swapped) SRAM address */
            virt_reg_sram_addr[virt_reg] = P_CC3XX->pka.memory_map[idx];
            virt_reg_phys_reg[virt_reg] = 0;
            virt_reg_is_mapped[virt_reg] = false;
        }

    }

    memset(phys_reg_mapping_list, 0, sizeof(phys_reg_mapping_list));

    /* N, Np and N_mask stay identity-mapped on the first physical registers */
    for (idx = 0; idx < PKA_PHYS_REG_FIRST_MAPPABLE; idx++) {
        phys_reg_mapping_list[idx] = idx;
    }

    phys_reg_next_mapped = PKA_PHYS_REG_FIRST_MAPPABLE;
}
162 
/* Reset the PKA engine and rebuild all register mappings from pka_state.
 *
 * Only pka_state.reg_size drives this: the operation size register, the
 * per-register SRAM addresses, and the two hardware temporary registers are
 * all derived from it.
 */
static void pka_init_from_state(void)
{
    uint32_t idx;

    P_CC3XX->misc.pka_clk_enable = 1;
    P_CC3XX->pka.pka_sw_reset = 1;

    /* Wait for SW reset to complete before proceeding */
    while(!P_CC3XX->pka.pka_done) {}

    /* The TRM says that this register is a byte-size, but it is in fact a
     * bit-size.
     */
    P_CC3XX->pka.pka_l[PKA_OP_SIZE_REGISTER] = pka_state.reg_size * 8;

    /* reg_size must be a whole, non-zero number of PKA words */
    assert((pka_state.reg_size & ((PKA_WORD_SIZE) - 1)) == 0);
    assert(pka_state.reg_size >= (PKA_WORD_SIZE));

    /* How many registers of this size fit into the PKA SRAM, capped at the
     * configured virtual register count plus the two hardware temporaries.
     */
    pka_reg_am_max = (PKA_SRAM_SIZE) / pka_state.reg_size;
    if (pka_reg_am_max >= CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT + 2) {
        pka_reg_am_max = CC3XX_CONFIG_PKA_MAX_VIRT_REG_AMOUNT + 2;
    }

    /* We need to allocate 4 special registers (and have at least 1 left for an
     * operation).
     */
    assert(pka_reg_am_max > 4);

    /* Unmap all the physical registers */
    cc3xx_lowlevel_pka_unmap_physical_registers();

    /* Set up the first three regions as N and Np, and N_mask. These are
     * special, so map them now.
     */
    for (idx = 0; idx < PKA_PHYS_REG_FIRST_MAPPABLE; idx++) {
        virt_reg_is_mapped[idx] = true;
        virt_reg_phys_reg[idx] = idx;
        P_CC3XX->pka.memory_map[idx] =
            pka_addr_from_byte_addr(pka_state.reg_size * idx);
        virt_reg_sram_addr[idx] =
            pka_addr_from_byte_addr(pka_state.reg_size * idx);
    }

    /* Then reserve all but two regions for the general purpose registers */
    for (; idx < pka_reg_am_max - 2; idx++) {
        virt_reg_sram_addr[idx] =
            pka_addr_from_byte_addr(pka_state.reg_size * idx);
        virt_reg_is_mapped[idx] = 0;
        virt_reg_phys_reg[idx] = 0;
    }

    /* The last two regions back the hardware temporary registers */
    P_CC3XX->pka.memory_map[PKA_PHYS_REG_TEMP_0] =
        pka_addr_from_byte_addr(pka_state.reg_size * idx);

    idx++;

    P_CC3XX->pka.memory_map[PKA_PHYS_REG_TEMP_1] =
        pka_addr_from_byte_addr(pka_state.reg_size * idx);

    /* We don't count the temporary registers in reg_am_max, since it's used for
     * verifying parameters of functions, and these should never be used as
     * parameters */
    pka_reg_am_max -= 2;
}
227 
/* Initialize the PKA engine for operations of (at most) `size` bytes.
 *
 * @param size Maximum operand size in bytes; transparently raised to 16 and
 *             must not exceed 256 (2048 bits).
 */
void cc3xx_lowlevel_pka_init(uint32_t size)
{
    /* Start from a clean slate: clears all driver state and the clock */
    cc3xx_lowlevel_pka_uninit();

    /* Minimum size is 16 bytes (128 bits), but just transparently increase it
     * if needed
     */
    if (size < 16) {
        size = 16;
    }

    /* Max size of an operation is 256 bytes (2048 bits). The actual max size is
     * 2112 bits, but 64 bits of overflow are required. */
    assert(size <= 256);

    /* Calculate the register size based on the requested operation size + the
     * size by which operations can overflow */
    pka_state.reg_size = pad_to_pka_word_size(size + PKA_MAX_OVERFLOW_SIZE);
    pka_state.virt_reg_next_mapped = PKA_VIRT_REG_FIRST_ALLOCATABLE;

    pka_init_from_state();
}
250 
/* Back virtual register virt_reg with the next free physical register,
 * programming the register's SRAM address into the hardware memory map.
 */
static void allocate_phys_reg(cc3xx_pka_reg_id_t virt_reg)
{
    uint32_t phys_reg;

    assert(phys_reg_next_mapped <= PKA_PHYS_REG_LAST_MAPPABLE);
    /* The two hardware temporary registers must never be handed out */
    assert(phys_reg_mapping_list[PKA_PHYS_REG_TEMP_0] == 0);
    assert(phys_reg_mapping_list[PKA_PHYS_REG_TEMP_1] == 0);

    phys_reg = phys_reg_next_mapped;
    phys_reg_next_mapped += 1;

    /* Reprogramming the memory map must not race the PKA pipeline */
    while(!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.memory_map[phys_reg] = virt_reg_sram_addr[virt_reg];
    while(!P_CC3XX->pka.pka_done) {}

    phys_reg_mapping_list[phys_reg] = virt_reg;
    virt_reg_is_mapped[virt_reg] = true;
    virt_reg_phys_reg[virt_reg] = phys_reg;
}
270 
cc3xx_lowlevel_pka_allocate_reg(void)271 cc3xx_pka_reg_id_t cc3xx_lowlevel_pka_allocate_reg(void)
272 {
273     cc3xx_pka_reg_id_t reg_id = 0;
274 
275     reg_id = pka_state.virt_reg_next_mapped;
276     assert(reg_id != pka_reg_am_max);
277 
278     pka_state.virt_reg_next_mapped += 1;
279 
280     virt_reg_in_use[reg_id] = true;
281 
282     return reg_id;
283 }
284 
/* To make this faster, it's only possible to free the most recently allocated
 * register. Register freeing must match this pattern.
 */
void cc3xx_lowlevel_pka_free_reg(cc3xx_pka_reg_id_t reg_id)
{
    /* LIFO discipline: only the most recently allocated register is freeable */
    assert(reg_id == pka_state.virt_reg_next_mapped - 1);
    assert(virt_reg_in_use[reg_id]);

    pka_state.virt_reg_next_mapped -= 1;

    virt_reg_in_use[reg_id] = false;
}
297 
ensure_virt_reg_is_mapped(cc3xx_pka_reg_id_t reg_id)298 static void CC3XX_ATTRIBUTE_INLINE ensure_virt_reg_is_mapped(cc3xx_pka_reg_id_t reg_id)
299 {
300     assert(reg_id <= pka_reg_am_max);
301 
302     if (!virt_reg_is_mapped[reg_id]) {
303         allocate_phys_reg(reg_id);
304     }
305 }
306 
/* Write a word buffer into a virtual PKA register.
 *
 * @param reg_id      Destination register; must be in use.
 * @param data        Source buffer, 4-byte aligned.
 * @param len         Length in bytes; multiple of 4, at most the register size.
 * @param swap_endian If true, words are written in reverse order with each
 *                    word byte-swapped (i.e. data is treated as a big-endian
 *                    number).
 */
static void pka_write_reg(cc3xx_pka_reg_id_t reg_id, const uint32_t *data,
                          size_t len, bool swap_endian)
{
    size_t idx;

    /* Check alignment */
    assert(((uintptr_t)data & (sizeof(uint32_t) - 1)) == 0);
    /* Check length */
    assert((len & (sizeof(uint32_t) - 1)) == 0);


    /* Check slot */
    assert(reg_id < pka_reg_am_max);
    assert(virt_reg_in_use[reg_id]);
    assert(len <= pka_state.reg_size);

    /* clear the register, so we don't have to explicitly write the upper words
     */
    cc3xx_lowlevel_pka_clear(reg_id);

    /* Make sure we have a physical register mapped for the virtual register */
    ensure_virt_reg_is_mapped(reg_id);

    /* Wait for any outstanding operations to finish before performing reads or
     * writes on the PKA SRAM
     */
    while(!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.pka_sram_addr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[reg_id]];
    while(!P_CC3XX->pka.pka_done) {}

    /* Write data, polling pka_done after every SRAM word access */
    for (idx = 0; idx < len / sizeof(uint32_t); idx++) {
        P_CC3XX->pka.pka_sram_wdata = swap_endian ? bswap_32(data[(len / sizeof(uint32_t) - 1) - idx])
                                                  : data[idx];
        while(!P_CC3XX->pka.pka_done) {}
    }
}
345 
/* Write a big-endian buffer into a PKA register (byte-swapping on the way) */
void cc3xx_lowlevel_pka_write_reg_swap_endian(cc3xx_pka_reg_id_t reg_id, const uint32_t *data,
                                     size_t len)
{
    /* No cast needed: pka_write_reg already takes const uint32_t *. The old
     * (uint32_t *) cast silently discarded the const qualifier.
     */
    pka_write_reg(reg_id, data, len, true);
}
351 
/* Write a native-endian (little-endian word order) buffer into a PKA register */
void cc3xx_lowlevel_pka_write_reg(cc3xx_pka_reg_id_t reg_id, const uint32_t *data, size_t len)
{
    pka_write_reg(reg_id, data, len, false);
}
356 
/* Read a virtual PKA register into a word buffer.
 *
 * @param reg_id      Source register; must be in use.
 * @param data        Destination buffer, 4-byte aligned.
 * @param len         Length in bytes; multiple of 4, at most the register size.
 * @param swap_endian If true, output words are reversed and byte-swapped
 *                    (i.e. the value is emitted as a big-endian number).
 */
static void pka_read_reg(cc3xx_pka_reg_id_t reg_id, uint32_t *data, size_t len,
                         bool swap_endian)
{
    size_t idx;

    /* Check alignment */
    assert(((uintptr_t)data & (sizeof(uint32_t) - 1)) == 0);
    /* Check length */
    assert((len & (sizeof(uint32_t) - 1)) == 0);

    /* Check slot */
    assert(reg_id < pka_reg_am_max);
    assert(virt_reg_in_use[reg_id]);
    assert(len <= pka_state.reg_size);

    /* Make sure we have a physical register mapped for the virtual register */
    ensure_virt_reg_is_mapped(reg_id);

    /* The PKA registers can be remapped by the hardware (by swapping the
     * values of the memory_map registers), so we need to read the memory_map
     * register to find the correct address.
     */
    while(!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.pka_sram_raddr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[reg_id]];
    while(!P_CC3XX->pka.pka_done) {}

    /* Read data. Poll pka_done after each SRAM access, matching the access
     * pattern of pka_write_reg — the original loop was missing this
     * synchronisation (likely a dropped line).
     */
    for (idx = 0; idx < len / sizeof(uint32_t); idx++) {
        if (swap_endian) {
            data[(len / sizeof(uint32_t) - 1) - idx] = bswap_32(P_CC3XX->pka.pka_sram_rdata);
        } else {
            data[idx] = P_CC3XX->pka.pka_sram_rdata;
        }
        while(!P_CC3XX->pka.pka_done) {}
    }
}
393 
/* Read a PKA register into a native-endian (little-endian word order) buffer */
void cc3xx_lowlevel_pka_read_reg(cc3xx_pka_reg_id_t reg_id, uint32_t *data, size_t len)
{
    pka_read_reg(reg_id, data, len, false);
}
398 
/* Read a PKA register out as a big-endian buffer */
void cc3xx_lowlevel_pka_read_reg_swap_endian(cc3xx_pka_reg_id_t reg_id, uint32_t *data, size_t len)
{
    /* Removed the redundant (uint32_t *) cast: data already has that type */
    pka_read_reg(reg_id, data, len, true);
}
403 
/* Calculate the Barrett Tag (https://en.wikipedia.org/wiki/Barrett_reduction)
 * to enable reduction modulo N. If this tag is not calculated, reduction
 * operations will fail. doi:10.1007/3-540-47721-7_24 is a good reference.
 *
 * We are attempting to calculate 2^k / N. In the reference the value k = 2 * n
 * where n is the bit-length of N is chosen due to the max value to be reduced
 * being representable in 2 * n bits. In the previous driver, instead k = n + 64
 * (which is the PKA word size), which means the max value to be reduced must be
 * representable in n + 64 bits. It is assumed, but not certain, that this holds
 * because of how the reduction in hardware is being calculated.
 */
static inline void calc_Np(void)
{
    cc3xx_pka_reg_id_t reg_temp_0 = cc3xx_lowlevel_pka_allocate_reg();
    cc3xx_pka_reg_id_t reg_temp_1 = cc3xx_lowlevel_pka_allocate_reg();
    uint32_t N_bit_size = cc3xx_lowlevel_pka_get_bit_size(CC3XX_PKA_REG_N);
    uint32_t power;

    /* If N is large, we perform a special-case operation to avoid having to
     * synthesize 2^k, which may be large. In this case, we first divide N by
     * 2^(N_bit_size - 2 * PKA_MAX_OVERFLOW_BIT_SIZE) and then divide the
     * constant 2^(3 * PKA_MAX_OVERFLOW_BIT_SIZE - 1) by the result, meaning
     * the largest number we need to synthesize in a register is
     * 2^(3 * PKA_MAX_OVERFLOW_BIT_SIZE - 1). This is done so that if the
     * modulus size is the maximum 2048 bits, then the largest synthesized
     * number fits into the 2112 bit register+overflow size.
     */
    if (N_bit_size > PKA_MAX_OVERFLOW_BIT_SIZE * 2) {
        power = PKA_MAX_OVERFLOW_BIT_SIZE * 3 - 1;
        cc3xx_lowlevel_pka_set_to_power_of_two(reg_temp_0, power);

        /* Divide N by 2^(N_bit_size - 2 * PKA_MAX_OVERFLOW_BIT_SIZE) */
        power = N_bit_size - 2 * PKA_MAX_OVERFLOW_BIT_SIZE;
        cc3xx_lowlevel_pka_shift_right_fill_0_ui(CC3XX_PKA_REG_N, power, reg_temp_1);

        /* Ceiling */
        cc3xx_lowlevel_pka_add_si(reg_temp_1, 1, reg_temp_1);
        cc3xx_lowlevel_pka_div(reg_temp_0, reg_temp_1, CC3XX_PKA_REG_NP, reg_temp_1);
    } else {
        /* set reg_temp_0 to 2^(N_bit_size + PKA_MAX_OVERFLOW_BIT_SIZE - 1) */
        power = N_bit_size + PKA_MAX_OVERFLOW_BIT_SIZE - 1;
        cc3xx_lowlevel_pka_set_to_power_of_two(reg_temp_0, power);

        /* Finally, perform the division */
        cc3xx_lowlevel_pka_div(reg_temp_0, CC3XX_PKA_REG_N, CC3XX_PKA_REG_NP, reg_temp_1);
    }

    /* Free in reverse allocation order (LIFO, see cc3xx_lowlevel_pka_free_reg) */
    cc3xx_lowlevel_pka_free_reg(reg_temp_1);
    cc3xx_lowlevel_pka_free_reg(reg_temp_0);
}
453 
/* Install a modulus N for subsequent modular operations.
 *
 * @param modulus       Register holding the modulus; copied into the N slot.
 * @param calculate_tag If true, derive the Barrett tag Np from N (requires
 *                      CC3XX_CONFIG_PKA_CALC_NP_ENABLE); otherwise copy it
 *                      from barrett_tag.
 * @param barrett_tag   Precomputed Barrett tag register; only read when
 *                      calculate_tag is false.
 */
void cc3xx_lowlevel_pka_set_modulus(cc3xx_pka_reg_id_t modulus, bool calculate_tag,
                                    cc3xx_pka_reg_id_t barrett_tag)
{
    uint32_t N_bit_size;

    assert(modulus < pka_reg_am_max);
    assert(virt_reg_in_use[modulus]);

    virt_reg_in_use[CC3XX_PKA_REG_N] = true;
    cc3xx_lowlevel_pka_copy(modulus, CC3XX_PKA_REG_N);

    /* This operation size must correspond exactly to the bit-size of the
     * modulus, so a bit-counting operation is performed.
     */
    N_bit_size = cc3xx_lowlevel_pka_get_bit_size(CC3XX_PKA_REG_N);
    P_CC3XX->pka.pka_l[PKA_OP_SIZE_N] = N_bit_size;

    /* N_mask = 2^N_bit_size - 1, i.e. all-ones over the width of N */
    virt_reg_in_use[CC3XX_PKA_REG_N_MASK] = true;
    cc3xx_lowlevel_pka_set_to_power_of_two(CC3XX_PKA_REG_N_MASK, N_bit_size);
    cc3xx_lowlevel_pka_sub_si(CC3XX_PKA_REG_N_MASK, 1, CC3XX_PKA_REG_N_MASK);

#ifndef CC3XX_CONFIG_PKA_CALC_NP_ENABLE
    assert(!calculate_tag);
#endif /* !CC3XX_CONFIG_PKA_CALC_NP_ENABLE */

    virt_reg_in_use[CC3XX_PKA_REG_NP] = true;
    if (calculate_tag) {
#ifdef CC3XX_CONFIG_PKA_CALC_NP_ENABLE
        calc_Np();
#endif /* CC3XX_CONFIG_PKA_CALC_NP_ENABLE */
    } else {
        assert(barrett_tag < pka_reg_am_max);
        assert(virt_reg_in_use[barrett_tag]);

        cc3xx_lowlevel_pka_copy(barrett_tag, CC3XX_PKA_REG_NP);
    }
}
491 
/* Snapshot the driver state and dump the requested registers into caller
 * buffers, so the PKA can later be restored with cc3xx_lowlevel_pka_set_state.
 */
void cc3xx_lowlevel_pka_get_state(struct cc3xx_pka_state_t *state, uint32_t save_reg_am,
                                  cc3xx_pka_reg_id_t *save_reg_list,
                                  uint32_t **save_reg_ptr_list,
                                  const size_t *save_reg_size_list)
{
    memcpy(state, &pka_state, sizeof(*state));

    for (uint32_t idx = 0; idx < save_reg_am; idx++) {
        const cc3xx_pka_reg_id_t reg = save_reg_list[idx];

        assert(reg < pka_reg_am_max);
        assert(virt_reg_in_use[reg]);

        cc3xx_lowlevel_pka_read_reg(reg, save_reg_ptr_list[idx], save_reg_size_list[idx]);
    }
}
510 
/* Restore the driver state captured by cc3xx_lowlevel_pka_get_state:
 * re-initializes the engine from the saved state, then reloads the given
 * registers from caller buffers.
 */
void cc3xx_lowlevel_pka_set_state(const struct cc3xx_pka_state_t *state,
                                  uint32_t load_reg_am, cc3xx_pka_reg_id_t *load_reg_list,
                                  const uint32_t **load_reg_ptr_list,
                                  const size_t *load_reg_size_list)
{
    memcpy(&pka_state, state, sizeof(pka_state));

    /* Rebuild hardware mappings before any register can be written */
    pka_init_from_state();

    for (uint32_t idx = 0; idx < load_reg_am; idx++) {
        const cc3xx_pka_reg_id_t reg = load_reg_list[idx];

        assert(reg < pka_reg_am_max);
        assert(virt_reg_in_use[reg]);

        cc3xx_lowlevel_pka_write_reg(reg, load_reg_ptr_list[idx], load_reg_size_list[idx]);
    }
}
531 
/* Clear all driver state and gate the PKA clock off */
void cc3xx_lowlevel_pka_uninit(void)
{
    memset(&pka_state, 0, sizeof(pka_state));
    memset(virt_reg_in_use, 0, sizeof(virt_reg_in_use));
    memset(virt_reg_is_mapped, 0, sizeof(virt_reg_is_mapped));
    memset(virt_reg_phys_reg, 0, sizeof(virt_reg_phys_reg));
    memset(virt_reg_sram_addr, 0, sizeof(virt_reg_sram_addr));
    memset(virt_reg_needs_n_mask, 0, sizeof(virt_reg_needs_n_mask));
    memset(phys_reg_mapping_list, 0, sizeof(phys_reg_mapping_list));
    phys_reg_next_mapped = 0;

    P_CC3XX->misc.pka_clk_enable = 0;
}
545 
opcode_construct(enum cc3xx_pka_operation_t op,enum pka_op_size_t size,bool r0_is_immediate,uint32_t r0,bool r1_is_immediate,uint32_t r1,bool discard_result,uint32_t res)546 static uint32_t CC3XX_ATTRIBUTE_INLINE opcode_construct(enum cc3xx_pka_operation_t op,
547                                                         enum pka_op_size_t size,
548                                                         bool r0_is_immediate, uint32_t r0,
549                                                         bool r1_is_immediate, uint32_t r1,
550                                                         bool discard_result, uint32_t res)
551 {
552     uint32_t opcode = 0;
553 
554     /* The tag part of the opcode register is designed to be used to debug PKA
555      * operations, but we don't use this functionality. For some of the opcodes
556      * that aren't documented in the TRM, this is used as a third register
557      * input.
558      */
559     /* opcode |= r3 & 0b11111; */
560 
561     /* The top bit of the output register select is a field which if set
562      * prevents the operation writing the output register (or more accurately,
563      * prevents the swapping of the virtual address of the output register and
564      * the temporary register). The pka_status register is still set, so flags
565      * such as the sign of the result can still be used.
566      */
567     if (!discard_result) {
568         assert(res >= 0);
569         assert(res < pka_reg_am_max);
570         assert(virt_reg_in_use[res]);
571         /* Make sure we have a physical register mapped for the virtual register */
572         ensure_virt_reg_is_mapped(res);
573         opcode |= (virt_reg_phys_reg[res] & 0b11111) << 6;
574     } else {
575         opcode |= (discard_result & 0b1) << 11;
576     }
577 
578     /* The top bit of the REG_A field is a toggle between being a register ID
579      * and an immediate, and the lower 5 bits give us either a 0-31 register ID,
580      * a -16-15 signed immediate or a 0-31 unsigned immediate depending on the
581      * operation.
582      */
583     if (r1_is_immediate) {
584         opcode |= (r1_is_immediate & 0b1) << 17;
585         opcode |= (r1 & 0b11111) << 12;
586     } else {
587         assert(r1 >= 0);
588         assert(r1 < pka_reg_am_max);
589         assert(virt_reg_in_use[r1]);
590         /* Make sure we have a physical register mapped for the virtual register */
591         ensure_virt_reg_is_mapped(r1);
592         opcode |= (virt_reg_phys_reg[r1] & 0b11111) << 12;
593     }
594 
595     /* For unclear reasons, the immediate (shift amount) for shift opcodes
596      * doesn't use the upper bit to denote that it isn't a register.
597      * Possibly because this opcode doesn't support register input?
598      */
599     if (op >= CC3XX_PKA_OPCODE_SHR0 && op <= CC3XX_PKA_OPCODE_SHL1) {
600         opcode &= ~(0b1 << 17);
601     }
602 
603     /* The top bit of the REG_B field is a toggle between being a register ID
604      * and an immediate, and the lower 5 bits give us either a 0-31 register ID,
605      * a -16-15 signed immediate or a 0-31 unsigned immediate depending on the
606      * operation.
607      */
608     if (r0_is_immediate) {
609         opcode |= (r0_is_immediate & 0b1) << 23;
610         opcode |= (r0 & 0b11111) << 18;
611     } else {
612         assert(r0 >= 0);
613         assert(r0 <= pka_reg_am_max);
614         assert(virt_reg_in_use[r0]);
615         /* Make sure we have a physical register mapped for the virtual register */
616         ensure_virt_reg_is_mapped(r0);
617         opcode |= (virt_reg_phys_reg[r0] & 0b11111) << 18;
618     }
619 
620     if (!r0_is_immediate) {
621         assert(virt_reg_is_mapped[r0]);
622     }
623 
624     if (!r1_is_immediate) {
625         assert(virt_reg_is_mapped[r1]);
626         if (!r0_is_immediate && r0 != r1) {
627             assert(virt_reg_phys_reg[r1] != virt_reg_phys_reg[r0]);
628         }
629     }
630 
631     if (!discard_result) {
632         assert(virt_reg_is_mapped[res]);
633         if (!r0_is_immediate && r0 != res) {
634             assert(virt_reg_phys_reg[res] != virt_reg_phys_reg[r0]);
635         }
636         if (!r1_is_immediate && r1 != res) {
637             assert(virt_reg_phys_reg[res] != virt_reg_phys_reg[r1]);
638         }
639     }
640 
641     /* Select which of the pka_l register is used for the bit-length of the
642      * operation.
643      */
644     opcode |= (size & 0b111) << 24;
645 
646     /* Set the actual operation */
647     opcode |= (op & 0b11111) << 27;
648 
649     /* Wait for a pipeline slot to be free before submitting this operation.
650      * Note that the previous operations may still be in progress at this point.
651      */
652     while(!P_CC3XX->pka.pka_pipe_rdy) {}
653 
654     return opcode;
655 }
656 
/* Return the bit-length of the value in r0 (index of the highest set bit,
 * counting from 1), or 0 if the register holds zero.
 */
uint32_t cc3xx_lowlevel_pka_get_bit_size(cc3xx_pka_reg_id_t r0)
{
    int32_t idx;
    uint32_t word;

    ensure_virt_reg_is_mapped(r0);

    /* This isn't an operation that can use the PKA pipeline, so we need to wait
     * for the pipeline to be finished before reading the SRAM.
     */
    while(!P_CC3XX->pka.pka_done) {}

    /* Scan down from the top of the register for the first non-zero word */
    for (idx = pka_state.reg_size - sizeof(uint32_t); idx >= 0;
        idx -= sizeof(uint32_t)) {
        P_CC3XX->pka.pka_sram_raddr =
            P_CC3XX->pka.memory_map[virt_reg_phys_reg[r0]] +
            pka_addr_from_byte_addr(idx);
        while(!P_CC3XX->pka.pka_done) {}

        word = P_CC3XX->pka.pka_sram_rdata;

        if (word) {
            break;
        }
    }

    if (idx < 0) {
        return 0;
    } else {
        /* idx is a byte offset, hence * 8; __builtin_clz counts the leading
         * zeros of the top non-zero word.
         */
        return idx * 8 + (32 - __builtin_clz(word));
    }
}
689 
/* Set register r0 to the value 2^power by clearing it and writing a single
 * one-bit word at the right offset.
 *
 * @param r0    Destination register.
 * @param power Exponent; must fit within the register size.
 */
void cc3xx_lowlevel_pka_set_to_power_of_two(cc3xx_pka_reg_id_t r0, uint32_t power)
{
    /* Use an unsigned constant for the shift: (1 << 31) on a signed int is
     * undefined behaviour (C11 6.5.7), hit whenever power % 32 == 31.
     */
    uint32_t final_word = UINT32_C(1) << (power % (sizeof(uint32_t) * 8));
    uint32_t word_offset = power / (8 * sizeof(uint32_t));

    cc3xx_lowlevel_pka_clear(r0);

    ensure_virt_reg_is_mapped(r0);

    /* This isn't an operation that can use the PKA pipeline, so we need to wait
     * for the pipeline to be finished before reading the SRAM.
     */
    while(!P_CC3XX->pka.pka_done) {}

    P_CC3XX->pka.pka_sram_addr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[r0]] + word_offset;
    while(!P_CC3XX->pka.pka_done) {}

    P_CC3XX->pka.pka_sram_wdata = final_word;
    while(!P_CC3XX->pka.pka_done) {}
}
711 
712 #ifdef CC3XX_CONFIG_RNG_ENABLE
cc3xx_lowlevel_pka_set_to_random(cc3xx_pka_reg_id_t r0,size_t bit_len)713 cc3xx_err_t cc3xx_lowlevel_pka_set_to_random(cc3xx_pka_reg_id_t r0, size_t bit_len)
714 {
715     uint32_t byte_size = (bit_len + 7) / 8;
716     uint32_t word_size = (byte_size + 3) / sizeof(uint32_t);
717     uint32_t random_buf[word_size];
718     cc3xx_err_t err;
719 
720     err = cc3xx_lowlevel_rng_get_random((uint8_t *)random_buf, word_size * sizeof(uint32_t));
721     if (err != CC3XX_ERR_SUCCESS) {
722         return err;
723     }
724 
725     /* Take off any extra bits */
726     random_buf[word_size - 1] = random_buf[word_size - 1] >> (32 - (bit_len % 32));
727 
728     cc3xx_lowlevel_pka_write_reg(r0, random_buf, sizeof(random_buf));
729 
730     return CC3XX_ERR_SUCCESS;
731 }
732 
/* Fill r0 with a uniformly random value in [0, N) by rejection sampling.
 *
 * Requires a modulus to have been installed via
 * cc3xx_lowlevel_pka_set_modulus.
 *
 * @return CC3XX_ERR_SUCCESS, or the RNG error code on failure.
 */
cc3xx_err_t cc3xx_lowlevel_pka_set_to_random_within_modulus(cc3xx_pka_reg_id_t r0)
{
    cc3xx_err_t err;
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    do {
        /* This uses the simple discard method from SP800-90A, because the modular
         * methods are impractical due to the pka_reduce function not working for
         * numbers significantly greater than OP_SIZE_N.
         */
        err = cc3xx_lowlevel_pka_set_to_random(r0, P_CC3XX->pka.pka_l[PKA_OP_SIZE_N]);
        if (err != CC3XX_ERR_SUCCESS) {
            return err;
        }
    } while (!cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    return CC3XX_ERR_SUCCESS;
}
751 #endif /* CC3XX_CONFIG_RNG_ENABLE */
752 
/* res = r0 + r1 (non-modular), over the full register operation size */
void cc3xx_lowlevel_pka_add(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_ADD_INC,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}
759 
/* res = r0 + imm, where imm is a signed immediate in [-16, 15] */
void cc3xx_lowlevel_pka_add_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_ADD_INC,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, imm, false, res);
}
769 
/* res = r0 - r1 (non-modular), over the full register operation size */
void cc3xx_lowlevel_pka_sub(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, false, r1, false, res);
}
776 
/* res = r0 - imm, where imm is a 5-bit two's-complement immediate */
void cc3xx_lowlevel_pka_sub_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    /* The immediate must be encodable in 5 signed bits */
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG, PKA_OP_SIZE_REGISTER,
                         false, r0, true, imm, false, res);
}
786 
/* res = 0 - r0: negation is encoded as a subtraction from immediate zero */
void cc3xx_lowlevel_pka_neg(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG, PKA_OP_SIZE_REGISTER,
                         true, 0, false, r0, false, res);
}
793 
/* res = (r0 + r1) mod N. Both operands must already be reduced below N */
void cc3xx_lowlevel_pka_mod_add(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODADD_MODINC, PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
804 
/* res = (r0 + imm) mod N, for a 5-bit signed immediate smaller than N */
void cc3xx_lowlevel_pka_mod_add_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    /* The immediate must be encodable in 5 signed bits */
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);

    /* Both operands must be below the modulus */
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_greater_than_si(CC3XX_PKA_REG_N, imm));

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODADD_MODINC, PKA_OP_SIZE_REGISTER,
                         false, r0, true, imm, false, res);
}
819 
/* res = (r0 - r1) mod N. Both operands must already be reduced below N */
void cc3xx_lowlevel_pka_mod_sub(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG,
                         PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
830 
/* res = (r0 - imm) mod N, for a 5-bit signed immediate smaller than N */
void cc3xx_lowlevel_pka_mod_sub_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    /* The immediate must be encodable in 5 signed bits */
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);

    /* Both operands must be below the modulus */
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_greater_than_si(CC3XX_PKA_REG_N, imm));

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG,
                         PKA_OP_SIZE_REGISTER,
                         false, r0, true, imm, false, res);
}
845 
/* res = (0 - r0) mod N: modular negation via subtraction from immediate 0 */
void cc3xx_lowlevel_pka_mod_neg(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODSUB_MODDEC_MODNEG,
                         PKA_OP_SIZE_REGISTER,
                         true, 0, false, r0, false, res);
}
855 
/* res = r0 & r1 (bitwise AND of two registers) */
void cc3xx_lowlevel_pka_and(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0, PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
862 
/* res = r0 & mask, where mask is a 5-bit unsigned immediate */
void cc3xx_lowlevel_pka_and_si(cc3xx_pka_reg_id_t r0, uint32_t mask, cc3xx_pka_reg_id_t res)
{
    /* The mask must fit the unsigned immediate encoding */
    assert(mask <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0, PKA_OP_SIZE_REGISTER,
                         false, r0, true, mask, false, res);
}
871 
/* Read a small group of bits directly out of the PKA SRAM backing register r0.
 *
 * \param r0     Register to read from.
 * \param idx    Bit index of the lowest bit to extract.
 * \param bit_am Number of bits to extract (at most 4).
 *               NOTE(review): bit_am == 0 would make `idx % bit_am` divide by
 *               zero — callers presumably always pass >= 1; confirm.
 *
 * \return The extracted bits, right-aligned in a uint32_t.
 */
uint32_t cc3xx_lowlevel_pka_test_bits_ui(cc3xx_pka_reg_id_t r0, uint32_t idx, uint32_t bit_am)
{
    uint32_t bits;
    /* Index of the 32-bit SRAM word containing bit `idx` */
    uint32_t word_offset = idx / (8 * sizeof(uint32_t));

    assert(bit_am <= 4);
    /* This prevents us from needing to read two words */
    assert(idx % bit_am == 0);

    /* The register must be mapped to physical SRAM before direct access */
    ensure_virt_reg_is_mapped(r0);

    /* Each SRAM address/data access must wait for the PKA pipeline to drain */
    while(!P_CC3XX->pka.pka_done) {}
    P_CC3XX->pka.pka_sram_raddr =
        P_CC3XX->pka.memory_map[virt_reg_phys_reg[r0]] + word_offset;
    while(!P_CC3XX->pka.pka_done) {}

    /* Shift the word so bit `idx` is at position 0, then mask to bit_am bits */
    bits = (P_CC3XX->pka.pka_sram_rdata >> (idx % 32)) & ((1 << bit_am) - 1);
    while(!P_CC3XX->pka.pka_done) {}

    /* Return the extracted bits */
    return bits;
}
894 
/* res = r0 with bit `idx` cleared, implemented as an AND with ~(1 << idx).
 *
 * NOTE(review): ~(1 << idx) does not fit the 5-bit immediate field as written;
 * this presumably relies on the encoder/hardware truncating the value to 5
 * bits and sign-extending it back to full width (the assert below limits idx
 * so the truncated mask stays correct) — confirm against the TRM.
 */
void cc3xx_lowlevel_pka_clear_bit(cc3xx_pka_reg_id_t r0, uint32_t idx, cc3xx_pka_reg_id_t res)
{
    /* Check that we can construct the required mask */
    assert((0x1 << idx) <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, ~(1 << idx), false, res);
}
904 
/* Zero every bit of r0, encoded as an AND with immediate 0 */
void cc3xx_lowlevel_pka_clear(cc3xx_pka_reg_id_t r0)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_AND_TST0_CLR0, PKA_OP_SIZE_REGISTER,
                         false, r0, true, 0, false, r0);
}
911 
/* res = r0 | r1 (bitwise OR of two registers) */
void cc3xx_lowlevel_pka_or(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0, PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
918 
/* res = r0 | mask, where mask is a 5-bit unsigned immediate */
void cc3xx_lowlevel_pka_or_si(cc3xx_pka_reg_id_t r0, uint32_t mask, cc3xx_pka_reg_id_t res)
{
    /* The mask must fit the unsigned immediate encoding */
    assert(mask <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0, PKA_OP_SIZE_REGISTER,
                         false, r0, true, mask, false, res);
}
927 
/* res = r0, encoded as an OR with immediate 0 */
void cc3xx_lowlevel_pka_copy(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0, PKA_OP_SIZE_REGISTER,
                         false, r0, true, 0, false, res);
}
934 
/* res = r0 with bit `idx` set.
 *
 * Setting a bit is an OR with the single-bit mask (1 << idx); the previous
 * use of the AND opcode here would have cleared every bit except idx instead.
 */
void cc3xx_lowlevel_pka_set_bit(cc3xx_pka_reg_id_t r0, uint32_t idx, cc3xx_pka_reg_id_t res)
{
    /* Check that we can construct the required mask within the 5-bit
     * unsigned immediate encoding (same constraint as clear_bit).
     */
    assert((0x1 << idx) <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_OR_COPY_SET0,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, 1 << idx, false, res);
}
943 
/* res = r0 ^ r1 (bitwise XOR of two registers) */
void cc3xx_lowlevel_pka_xor(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                         PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
950 
/* res = r0 ^ mask, where mask is a 5-bit unsigned immediate */
void cc3xx_lowlevel_pka_xor_si(cc3xx_pka_reg_id_t r0, uint32_t mask, cc3xx_pka_reg_id_t res)
{
    /* The mask must fit the unsigned immediate encoding */
    assert(mask <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                         PKA_OP_SIZE_REGISTER,
                         false, r0, true, mask, false, res);
}
959 
/* res = r0 with bit `idx` flipped, encoded as an XOR with (1 << idx).
 *
 * The mask is a 5-bit unsigned immediate, so (1 << idx) must be encodable —
 * the previous `idx < 32` assert allowed masks that cannot fit the immediate
 * field (and `1 << 31` is signed-overflow UB); use the same mask-fits check
 * as clear_bit.
 */
void cc3xx_lowlevel_pka_flip_bit(cc3xx_pka_reg_id_t r0, uint32_t idx, cc3xx_pka_reg_id_t res)
{
    /* Check that we can construct the required mask */
    assert((0x1 << idx) <= PKA_MAX_UNSIGNED_IMMEDIATE);

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                                           PKA_OP_SIZE_REGISTER,
                                           false, r0, true, 1 << idx, false, res);
}
968 
/* Test whether r0 == r1, using the hardware COMPARE (XOR-family) operation
 * in tag-only mode (no result register is written).
 */
bool cc3xx_lowlevel_pka_are_equal(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                         PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, true, 0);

    /* The status register is only valid once the pipeline has drained */
    while (!P_CC3XX->pka.pka_done) {}

    /* ALU_OUT_ZERO is bit 12 of the status register */
    return (P_CC3XX->pka.pka_status >> 12) & 0x1;
}
984 
/* Test whether r0 equals the signed immediate `imm`, using the hardware
 * COMPARE operation in tag-only mode (no result register is written).
 */
bool cc3xx_lowlevel_pka_are_equal_si(cc3xx_pka_reg_id_t r0, int32_t imm)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_XOR_FLIP0_INVERT_COMPARE,
                         PKA_OP_SIZE_REGISTER,
                         false, r0, true, imm, true, 0);

    /* The status register is only valid once the pipeline has drained */
    while (!P_CC3XX->pka.pka_done) {}

    /* ALU_OUT_ZERO is bit 12 of the status register */
    return (P_CC3XX->pka.pka_status >> 12) & 0x1;
}
1000 
/* Test whether r0 < r1 by performing a discarded subtraction and reading the
 * sign flag of the result.
 */
bool cc3xx_lowlevel_pka_less_than(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG, PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, true, 0);

    /* The status register is only valid once the pipeline has drained */
    while (!P_CC3XX->pka.pka_done) {}

    /* ALU_SIGN_OUT is bit 8 of the status register */
    return (P_CC3XX->pka.pka_status >> 8) & 0x1;
}
1015 
/* Test whether r0 < imm by performing a discarded subtraction and reading the
 * sign flag of the result.
 */
bool cc3xx_lowlevel_pka_less_than_si(cc3xx_pka_reg_id_t r0, int32_t imm)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_SUB_DEC_NEG, PKA_OP_SIZE_REGISTER,
                         false, r0, true, imm, true, 0);

    /* The status register is only valid once the pipeline has drained */
    while (!P_CC3XX->pka.pka_done) {}

    /* ALU_SIGN_OUT is bit 8 of the status register */
    return (P_CC3XX->pka.pka_status >> 8) & 0x1;
}
1030 
/* r0 > r1 exactly when r0 is neither less than nor equal to r1 */
bool cc3xx_lowlevel_pka_greater_than(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1)
{
    if (cc3xx_lowlevel_pka_less_than(r0, r1)) {
        return false;
    }

    return !cc3xx_lowlevel_pka_are_equal(r0, r1);
}
1036 
/* r0 > imm exactly when r0 is neither less than nor equal to imm */
bool cc3xx_lowlevel_pka_greater_than_si(cc3xx_pka_reg_id_t r0, int32_t imm)
{
    if (cc3xx_lowlevel_pka_less_than_si(r0, imm)) {
        return false;
    }

    return !cc3xx_lowlevel_pka_are_equal_si(r0, imm);
}
1042 
/* Shift r0 right by `shift` bits, filling vacated bits with 0, into res.
 *
 * The hardware SHR0 operation shifts by one more bit than the programmed
 * immediate, so each step encodes (step - 1). Shifts wider than the
 * immediate encoding allows are split into multiple operations; after the
 * first step the partial result in `res` becomes the new source.
 */
void cc3xx_lowlevel_pka_shift_right_fill_0_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    uint32_t shift_am;

    /* A zero-bit shift is just a copy; return early since the loop below
     * encodes shift_am - 1 and cannot express it.
     */
    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
        return;
    }

    while (shift > 0) {
        shift_am = shift <= (PKA_MAX_UNSIGNED_IMMEDIATE + 1) ? shift : (PKA_MAX_UNSIGNED_IMMEDIATE + 1);

        P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_SHR0,
                                               PKA_OP_SIZE_REGISTER,
                                               false, r0, true, shift_am - 1, false, res);
        shift -= shift_am;
        /* Subsequent partial shifts operate on the partial result */
        r0 = res;
    }
}
1067 
/* Shift r0 right by `shift` bits, filling vacated bits with 1, into res.
 *
 * The hardware SHR1 operation shifts by (immediate + 1) bits, so each step
 * encodes step - 1; wide shifts are split across multiple operations.
 */
void cc3xx_lowlevel_pka_shift_right_fill_1_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    const uint32_t max_step = PKA_MAX_UNSIGNED_IMMEDIATE + 1;

    /* A zero-bit shift is just a copy (the loop below is then skipped) */
    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        uint32_t step = (shift < max_step) ? shift : max_step;

        P_CC3XX->pka.opcode =
            opcode_construct(CC3XX_PKA_OPCODE_SHR1, PKA_OP_SIZE_REGISTER,
                             false, r0, true, step - 1, false, res);
        shift -= step;
        /* Subsequent partial shifts operate on the partial result */
        r0 = res;
    }
}
1086 
/* Shift r0 left by `shift` bits, filling vacated bits with 0, into res.
 *
 * The hardware SHL0 operation shifts by (immediate + 1) bits, so each step
 * encodes step - 1; wide shifts are split across multiple operations.
 */
void cc3xx_lowlevel_pka_shift_left_fill_0_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    const uint32_t max_step = PKA_MAX_UNSIGNED_IMMEDIATE + 1;

    /* A zero-bit shift is just a copy (the loop below is then skipped) */
    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        uint32_t step = (shift < max_step) ? shift : max_step;

        P_CC3XX->pka.opcode =
            opcode_construct(CC3XX_PKA_OPCODE_SHL0, PKA_OP_SIZE_REGISTER,
                             false, r0, true, step - 1, false, res);
        shift -= step;
        /* Subsequent partial shifts operate on the partial result */
        r0 = res;
    }
}
1105 
/* Shift r0 left by `shift` bits, filling vacated bits with 1, into res.
 *
 * The hardware SHL1 operation shifts by (immediate + 1) bits, so each step
 * encodes step - 1; wide shifts are split across multiple operations.
 */
void cc3xx_lowlevel_pka_shift_left_fill_1_ui(cc3xx_pka_reg_id_t r0, uint32_t shift, cc3xx_pka_reg_id_t res)
{
    const uint32_t max_step = PKA_MAX_UNSIGNED_IMMEDIATE + 1;

    /* A zero-bit shift is just a copy (the loop below is then skipped) */
    if (shift == 0) {
        cc3xx_lowlevel_pka_copy(r0, res);
    }

    while (shift > 0) {
        uint32_t step = (shift < max_step) ? shift : max_step;

        P_CC3XX->pka.opcode =
            opcode_construct(CC3XX_PKA_OPCODE_SHL1, PKA_OP_SIZE_REGISTER,
                             false, r0, true, step - 1, false, res);
        shift -= step;
        /* Subsequent partial shifts operate on the partial result */
        r0 = res;
    }
}
1124 
/* res = low half of the double-width product r0 * r1 */
void cc3xx_lowlevel_pka_mul_low_half(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MULLOW, PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
1131 
/* res = high half of the double-width product r0 * r1 */
void cc3xx_lowlevel_pka_mul_high_half(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MULHIGH, PKA_OP_SIZE_REGISTER,
                         false, r0, false, r1, false, res);
}
1138 
/* Integer division: quotient = r0 / r1, remainder = r0 % r1.
 *
 * The hardware DIV operation overwrites its first operand with the remainder,
 * and produces no result if the divisor register is also the quotient
 * register; temporary registers are used to keep the caller's inputs intact
 * in both cases. Temporaries are freed in reverse order of allocation.
 */
void cc3xx_lowlevel_pka_div(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t quotient,
                   cc3xx_pka_reg_id_t remainder)
{
    cc3xx_pka_reg_id_t temp_r0 = cc3xx_lowlevel_pka_allocate_reg();
    cc3xx_pka_reg_id_t temp_r1;

    /* Since the div operation uses r0 to store the remainder, and we want to
     * avoid clobbering input registers, perform a copy first.
     */
    cc3xx_lowlevel_pka_copy(r0, temp_r0);

    /* If r1 is also the quotient register, this produces no result. In this
     * case, copy to a temporary register.
     */
    if (r1 == quotient) {
        temp_r1 = cc3xx_lowlevel_pka_allocate_reg();
        cc3xx_lowlevel_pka_copy(r1, temp_r1);
    } else {
        temp_r1 = r1;
    }

    P_CC3XX->pka.opcode = opcode_construct(CC3XX_PKA_OPCODE_DIV,
                                           PKA_OP_SIZE_REGISTER,
                                           false, temp_r0, false, temp_r1,
                                           false, quotient);

    /* Now clobber the remainder register (temp_r0 holds r0 % r1 after DIV) */
    cc3xx_lowlevel_pka_copy(temp_r0, remainder);

    /* temp_r1 was only allocated when r1 aliased the quotient register */
    if (temp_r1 != r1) {
        cc3xx_lowlevel_pka_free_reg(temp_r1);
    }
    cc3xx_lowlevel_pka_free_reg(temp_r0);
}
1173 
/* res = (r0 * r1) mod N, via the dedicated modular multiplier.
 * Both operands must already be reduced below N.
 */
void cc3xx_lowlevel_pka_mod_mul(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    /* PKA_OP_SIZE_N is used here (rather than _REGISTER) not because of the
     * reduction — mod_add uses _REGISTER — but because this opcode runs on
     * the special-purpose modular multiplier instead of the ALU.
     */
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODMUL, PKA_OP_SIZE_N,
                         false, r0, false, r1, false, res);

    /* Operating at OP_SIZE_N can leave garbage bits in the words above N;
     * mask them off.
     */
    cc3xx_lowlevel_pka_and(res, CC3XX_PKA_REG_N_MASK, res);
}
1194 
/* res = (r0 * imm) mod N, for a small non-negative immediate */
void cc3xx_lowlevel_pka_mod_mul_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    cc3xx_pka_reg_id_t multiplier = cc3xx_lowlevel_pka_allocate_reg();

    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    /* This operation doesn't work with negative numbers */
    assert(imm >= 0);

    /* Materialize the immediate into a register: a freshly cleared register
     * plus imm is effectively a set.
     */
    cc3xx_lowlevel_pka_clear(multiplier);
    cc3xx_lowlevel_pka_add_si(multiplier, imm, multiplier);

    cc3xx_lowlevel_pka_mod_mul(r0, multiplier, res);

    cc3xx_lowlevel_pka_free_reg(multiplier);
}
1213 
/* res = (r0 ^ r1) mod N, via the dedicated modular multiplier.
 * Both operands must already be reduced below N.
 */
void cc3xx_lowlevel_pka_mod_exp(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t r1, cc3xx_pka_reg_id_t res)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(cc3xx_lowlevel_pka_less_than(r1, CC3XX_PKA_REG_N));

    /* PKA_OP_SIZE_N is used here (rather than _REGISTER) not because of the
     * reduction — mod_add uses _REGISTER — but because this opcode runs on
     * the special-purpose modular multiplier instead of the ALU.
     */
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_MODEXP, PKA_OP_SIZE_N,
                         false, r0, false, r1, false, res);

    /* Operating at OP_SIZE_N can leave garbage bits in the words above N;
     * mask them off.
     */
    cc3xx_lowlevel_pka_and(res, CC3XX_PKA_REG_N_MASK, res);
}
1233 
/* res = (r0 ^ imm) mod N, for a small non-negative immediate exponent */
void cc3xx_lowlevel_pka_mod_exp_si(cc3xx_pka_reg_id_t r0, int32_t imm, cc3xx_pka_reg_id_t res)
{
    cc3xx_pka_reg_id_t exponent = cc3xx_lowlevel_pka_allocate_reg();

    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));
    assert(imm <= PKA_MAX_SIGNED_IMMEDIATE);
    assert(imm >= PKA_MIN_SIGNED_IMMEDIATE);

    /* This operation doesn't work with negative numbers */
    assert(imm >= 0);

    /* Materialize the immediate into a register: a freshly cleared register
     * plus imm is effectively a set.
     */
    cc3xx_lowlevel_pka_clear(exponent);
    cc3xx_lowlevel_pka_add_si(exponent, imm, exponent);

    cc3xx_lowlevel_pka_mod_exp(r0, exponent, res);

    cc3xx_lowlevel_pka_free_reg(exponent);
}
1254 
/* res = r0^-1 mod N, computed as r0^(N-2) mod N.
 *
 * NOTE(review): this identity is Fermat's little theorem (the prime-modulus
 * special case of Euler's theorem), so it is only correct when N is prime —
 * confirm that every caller guarantees a prime modulus.
 */
void cc3xx_lowlevel_pka_mod_inv(cc3xx_pka_reg_id_t r0, cc3xx_pka_reg_id_t res)
{
    cc3xx_pka_reg_id_t n_minus_2 = cc3xx_lowlevel_pka_allocate_reg();

    /* Inversion by exponentiation: a^-1 = a^(N-2) mod N (N prime) */
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);
    assert(cc3xx_lowlevel_pka_less_than(r0, CC3XX_PKA_REG_N));

    cc3xx_lowlevel_pka_sub_si(CC3XX_PKA_REG_N, 2, n_minus_2);
    cc3xx_lowlevel_pka_mod_exp(r0, n_minus_2, res);

    cc3xx_lowlevel_pka_free_reg(n_minus_2);
}
1268 
/* Reduce r0 modulo N in place, using the REDUCTION operation of the
 * dedicated modular multiplier.
 */
void cc3xx_lowlevel_pka_reduce(cc3xx_pka_reg_id_t r0)
{
    assert(virt_reg_in_use[CC3XX_PKA_REG_N]);

    /* PKA_OP_SIZE_N is used here (rather than _REGISTER) because this opcode
     * runs on the special-purpose modular multiplier instead of the ALU.
     */
    P_CC3XX->pka.opcode =
        opcode_construct(CC3XX_PKA_OPCODE_REDUCTION, PKA_OP_SIZE_N,
                         false, r0, false, 0, false, r0);

    /* Operating at OP_SIZE_N can leave garbage bits in the words above N;
     * mask them off.
     */
    cc3xx_lowlevel_pka_and(r0, CC3XX_PKA_REG_N_MASK, r0);
}
1286