Lines Matching +full:1 +full:- +full:16

1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
5 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
8 /* included by aes-ce.S and aes-neon.S */
55 stp x29, x30, [sp, #-16]!
63 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
65 ST5( ld1 {v4.16b}, [x1], #16 )
67 st1 {v0.16b-v3.16b}, [x0], #64
68 ST5( st1 {v4.16b}, [x0], #16 )
74 ld1 {v0.16b}, [x1], #16 /* get next pt block */
76 st1 {v0.16b}, [x0], #16
77 subs w4, w4, #1
80 ldp x29, x30, [sp], #16
86 stp x29, x30, [sp, #-16]!
94 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
96 ST5( ld1 {v4.16b}, [x1], #16 )
98 st1 {v0.16b-v3.16b}, [x0], #64
99 ST5( st1 {v4.16b}, [x0], #16 )
105 ld1 {v0.16b}, [x1], #16 /* get next ct block */
107 st1 {v0.16b}, [x0], #16
108 subs w4, w4, #1
111 ldp x29, x30, [sp], #16
130 ld1 {v4.16b}, [x5] /* get iv */
132 mov w8, #14 /* AES-256: 14 rounds */
139 ld1 {v4.16b}, [x5] /* get iv */
145 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
146 eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
148 eor v1.16b, v1.16b, v0.16b
150 eor v2.16b, v2.16b, v1.16b
152 eor v3.16b, v3.16b, v2.16b
154 st1 {v0.16b-v3.16b}, [x0], #64
155 mov v4.16b, v3.16b
161 ld1 {v0.16b}, [x1], #16 /* get next pt block */
162 eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
164 st1 {v4.16b}, [x0], #16
165 subs w4, w4, #1
168 st1 {v4.16b}, [x5] /* return iv */
174 stp x29, x30, [sp, #-16]!
177 ld1 {cbciv.16b}, [x5] /* get iv */
179 mov w8, #14 /* AES-256: 14 rounds */
185 stp x29, x30, [sp, #-16]!
188 ld1 {cbciv.16b}, [x5] /* get iv */
195 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
197 ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
198 mov v5.16b, v0.16b
199 mov v6.16b, v1.16b
200 mov v7.16b, v2.16b
203 eor v0.16b, v0.16b, cbciv.16b
204 eor v1.16b, v1.16b, v5.16b
205 ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
206 ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
207 eor v2.16b, v2.16b, v6.16b
208 eor v3.16b, v3.16b, v7.16b
209 eor v4.16b, v4.16b, v5.16b
211 mov v4.16b, v0.16b
212 mov v5.16b, v1.16b
213 mov v6.16b, v2.16b
215 sub x1, x1, #16
216 eor v0.16b, v0.16b, cbciv.16b
217 eor v1.16b, v1.16b, v4.16b
218 ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
219 eor v2.16b, v2.16b, v5.16b
220 eor v3.16b, v3.16b, v6.16b
222 st1 {v0.16b-v3.16b}, [x0], #64
223 ST5( st1 {v4.16b}, [x0], #16 )
229 ld1 {v1.16b}, [x1], #16 /* get next ct block */
230 mov v0.16b, v1.16b /* ...and copy to v0 */
232 eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
233 mov cbciv.16b, v1.16b /* ct is next iv */
234 st1 {v0.16b}, [x0], #16
235 subs w4, w4, #1
238 st1 {cbciv.16b}, [x5] /* return iv */
239 ldp x29, x30, [sp], #16
254 sub x4, x4, #16
258 ld1 {v3.16b}, [x8]
259 ld1 {v4.16b}, [x9]
261 ld1 {v0.16b}, [x1], x4 /* overlapping loads */
262 ld1 {v1.16b}, [x1]
264 ld1 {v5.16b}, [x5] /* get iv */
267 eor v0.16b, v0.16b, v5.16b /* xor with iv */
268 tbl v1.16b, {v1.16b}, v4.16b
271 eor v1.16b, v1.16b, v0.16b
272 tbl v0.16b, {v0.16b}, v3.16b
276 st1 {v0.16b}, [x4] /* overlapping stores */
277 st1 {v1.16b}, [x0]
283 sub x4, x4, #16
287 ld1 {v3.16b}, [x8]
288 ld1 {v4.16b}, [x9]
290 ld1 {v0.16b}, [x1], x4 /* overlapping loads */
291 ld1 {v1.16b}, [x1]
293 ld1 {v5.16b}, [x5] /* get iv */
297 tbl v2.16b, {v0.16b}, v3.16b
298 eor v2.16b, v2.16b, v1.16b
300 tbx v0.16b, {v1.16b}, v4.16b
302 eor v0.16b, v0.16b, v5.16b /* xor with iv */
305 st1 {v2.16b}, [x4] /* overlapping stores */
306 st1 {v0.16b}, [x0]
340 stp x29, x30, [sp, #-16]!
344 ld1 {vctr.16b}, [IV]
349 * the 64-bit counter with the IV.
355 umov IV_PART, vctr.d[1]
368 * Set up the counter values in v0-v{MAX_STRIDE-1}.
372 * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
380 mov v0.16b, vctr.16b
381 mov v1.16b, vctr.16b
382 mov v2.16b, vctr.16b
383 mov v3.16b, vctr.16b
384 ST5( mov v4.16b, vctr.16b )
386 sub x6, CTR, #MAX_STRIDE - 1
387 sub x7, CTR, #MAX_STRIDE - 2
388 sub x8, CTR, #MAX_STRIDE - 3
389 sub x9, CTR, #MAX_STRIDE - 4
390 ST5( sub x10, CTR, #MAX_STRIDE - 5 )
403 .subsection 1
416 add x8, x8, #1
428 adr x16, 1f
441 1: b 2f
445 ins vctr.d[1], x7
446 sub x7, IV_PART, #MAX_STRIDE - 1
447 sub x8, IV_PART, #MAX_STRIDE - 2
448 sub x9, IV_PART, #MAX_STRIDE - 3
451 mov v1.d[1], x7
453 ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
454 mov v2.d[1], x8
456 mov v3.d[1], x9
457 ST5( mov v4.d[1], x10 )
465 ld1 {v5.16b-v7.16b}, [IN], #48
468 eor v0.16b, v5.16b, v0.16b
469 ST4( ld1 {v5.16b}, [IN], #16 )
470 eor v1.16b, v6.16b, v1.16b
471 ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
472 eor v2.16b, v7.16b, v2.16b
473 eor v3.16b, v5.16b, v3.16b
474 ST5( eor v4.16b, v6.16b, v4.16b )
475 st1 {v0.16b-v3.16b}, [OUT], #64
476 ST5( st1 {v4.16b}, [OUT], #16 )
482 st1 {vctr.16b}, [IV] /* return next CTR value */
484 ldp x29, x30, [sp], #16
489 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
491 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
500 mov x16, #16
504 ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
506 cmp BYTES_W, #48 - (MAX_STRIDE << 4)
508 cmp BYTES_W, #32 - (MAX_STRIDE << 4)
510 cmp BYTES_W, #16 - (MAX_STRIDE << 4)
516 ST5( ld1 {v5.16b}, [IN], x14 )
517 ld1 {v6.16b}, [IN], x15
518 ld1 {v7.16b}, [IN], x16
523 ld1 {v8.16b}, [IN], x13
524 ld1 {v9.16b}, [IN]
525 ld1 {v10.16b}, [x9]
527 ST4( eor v6.16b, v6.16b, v0.16b )
528 ST4( eor v7.16b, v7.16b, v1.16b )
529 ST4( tbl v3.16b, {v3.16b}, v10.16b )
530 ST4( eor v8.16b, v8.16b, v2.16b )
531 ST4( eor v9.16b, v9.16b, v3.16b )
533 ST5( eor v5.16b, v5.16b, v0.16b )
534 ST5( eor v6.16b, v6.16b, v1.16b )
535 ST5( tbl v4.16b, {v4.16b}, v10.16b )
536 ST5( eor v7.16b, v7.16b, v2.16b )
537 ST5( eor v8.16b, v8.16b, v3.16b )
538 ST5( eor v9.16b, v9.16b, v4.16b )
540 ST5( st1 {v5.16b}, [OUT], x14 )
541 st1 {v6.16b}, [OUT], x15
542 st1 {v7.16b}, [OUT], x16
544 st1 {v9.16b}, [x13] // overlapping stores
545 st1 {v8.16b}, [OUT]
550 * Handle <= 16 bytes of plaintext
552 * This code always reads and writes 16 bytes. To avoid out of bounds
554 * encrypting/decrypting less than 16 bytes.
558 * This causes unusual behaviour when encrypting/decrypting less than 16
563 sub x8, x7, #16
567 ld1 {v5.16b}, [IN]
568 ld1 {v6.16b}, [OUT]
569 ST5( mov v3.16b, v4.16b )
571 ld1 {v10.16b-v11.16b}, [x9]
572 tbl v3.16b, {v3.16b}, v10.16b
573 sshr v11.16b, v11.16b, #7
574 eor v5.16b, v5.16b, v3.16b
575 bif v5.16b, v6.16b, v11.16b
576 st1 {v5.16b}, [OUT]
599 * The input and output buffers must always be at least 16 bytes even if
600 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
602 * to be at the end of this 16-byte temporary buffer rather than the
614 * The input and output buffers must always be at least 16 bytes even if
615 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
617 * to be at the end of this 16-byte temporary buffer rather than the
622 ctr_encrypt 1
635 and \tmp\().16b, \tmp\().16b, xtsmask.16b
637 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
638 eor \out\().16b, \out\().16b, \tmp\().16b
648 stp x29, x30, [sp, #-16]!
651 ld1 {v4.16b}, [x6]
668 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
670 eor v0.16b, v0.16b, v4.16b
672 eor v1.16b, v1.16b, v5.16b
673 eor v2.16b, v2.16b, v6.16b
675 eor v3.16b, v3.16b, v7.16b
677 eor v3.16b, v3.16b, v7.16b
678 eor v0.16b, v0.16b, v4.16b
679 eor v1.16b, v1.16b, v5.16b
680 eor v2.16b, v2.16b, v6.16b
681 st1 {v0.16b-v3.16b}, [x0], #64
682 mov v4.16b, v7.16b
689 subs w4, w4, #16
692 ld1 {v0.16b}, [x1], #16
694 eor v0.16b, v0.16b, v4.16b
696 eor v0.16b, v0.16b, v4.16b
698 subs w4, w4, #16
701 st1 {v0.16b}, [x0], #16
704 st1 {v0.16b}, [x0]
706 st1 {v4.16b}, [x6]
707 ldp x29, x30, [sp], #16
711 mov v0.16b, v3.16b
712 sub x0, x0, #16
717 add w4, w4, #16 /* # bytes in final block */
723 ld1 {v1.16b}, [x1] /* load final block */
724 ld1 {v2.16b}, [x8]
725 ld1 {v3.16b}, [x9]
727 tbl v2.16b, {v0.16b}, v2.16b
728 tbx v0.16b, {v1.16b}, v3.16b
729 st1 {v2.16b}, [x4] /* overlapping stores */
735 stp x29, x30, [sp, #-16]!
738 /* subtract 16 bytes if we are doing CTS */
743 ld1 {v4.16b}, [x6]
761 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
763 eor v0.16b, v0.16b, v4.16b
765 eor v1.16b, v1.16b, v5.16b
766 eor v2.16b, v2.16b, v6.16b
768 eor v3.16b, v3.16b, v7.16b
770 eor v3.16b, v3.16b, v7.16b
771 eor v0.16b, v0.16b, v4.16b
772 eor v1.16b, v1.16b, v5.16b
773 eor v2.16b, v2.16b, v6.16b
774 st1 {v0.16b-v3.16b}, [x0], #64
775 mov v4.16b, v7.16b
782 subs w4, w4, #16
784 ld1 {v0.16b}, [x1], #16
787 eor v0.16b, v0.16b, v4.16b
789 eor v0.16b, v0.16b, v4.16b
790 st1 {v0.16b}, [x0], #16
792 subs w4, w4, #16
796 st1 {v4.16b}, [x6]
797 ldp x29, x30, [sp], #16
804 add w4, w4, #16 /* # bytes in final block */
812 ld1 {v1.16b}, [x1] /* load final block */
813 ld1 {v2.16b}, [x8]
814 ld1 {v3.16b}, [x9]
816 eor v0.16b, v0.16b, v5.16b
818 eor v0.16b, v0.16b, v5.16b
820 tbl v2.16b, {v0.16b}, v2.16b
821 tbx v0.16b, {v1.16b}, v3.16b
823 st1 {v2.16b}, [x4] /* overlapping stores */
833 ld1 {v0.16b}, [x4] /* get dg */
842 ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
843 eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
845 eor v0.16b, v0.16b, v2.16b
847 eor v0.16b, v0.16b, v3.16b
849 eor v0.16b, v0.16b, v4.16b
854 st1 {v0.16b}, [x4] /* return dg */
861 ld1 {v1.16b}, [x0], #16 /* get next pt block */
862 eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
864 subs w3, w3, #1
873 st1 {v0.16b}, [x4] /* return dg */