1// SPDX-License-Identifier: GPL-2.0
2
3// This code is taken from the OpenSSL project but the author (Andy Polyakov)
4// has relicensed it under the GPLv2. Therefore this program is free software;
5// you can redistribute it and/or modify it under the terms of the GNU General
6// Public License version 2 as published by the Free Software Foundation.
7//
8// The original headers, including the original license headers, are
9// included below for completeness.
10
11// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
12//
13// Licensed under the OpenSSL license (the "License").  You may not use
14// this file except in compliance with the License.  You can obtain a copy
15// in the file LICENSE in the source distribution or at
16// https://www.openssl.org/source/license.html
17
18// ====================================================================
19// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
20// project. The module is, however, dual licensed under OpenSSL and
21// CRYPTOGAMS licenses depending on where you obtain it. For further
22// details see http://www.openssl.org/~appro/cryptogams/.
23// ====================================================================
24//
25// SHA256/512 for ARMv8.
26//
27// Performance in cycles per processed byte and improvement coefficient
28// over code generated with "default" compiler:
29//
30//		SHA256-hw	SHA256(*)	SHA512
31// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
32// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
33// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
34// Denver	2.01		10.5 (+26%)	6.70 (+8%)
35// X-Gene			20.0 (+100%)	12.8 (+300%(***))
36// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
37//
38// (*)	Software SHA256 results are of lesser relevance, presented
39//	mostly for informational purposes.
40// (**)	The result is a trade-off: it's possible to improve it by
41//	10% (or by 1 cycle per round), but at the cost of 20% loss
42//	on Cortex-A53 (or by 4 cycles per round).
43// (***)	Super-impressive coefficients over gcc-generated code are
44//	indication of some compiler "pathology", most notably code
45//	generated with -mgeneral-regs-only is significantly faster
46//	and the gap is only 40-90%.
47//
48// October 2016.
49//
50// Originally it was reckoned that it makes no sense to implement NEON
51// version of SHA256 for 64-bit processors. This is because performance
52// improvement on most wide-spread Cortex-A5x processors was observed
53// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
54// observed that 32-bit NEON SHA256 performs significantly better than
55// 64-bit scalar version on *some* of the more recent processors. As a
56// result, a 64-bit NEON version of SHA256 was added to provide the best
57// all-round performance. For example it executes ~30% faster on X-Gene
58// and Mongoose. [For reference, NEON version of SHA512 is bound to
59// deliver much less improvement, likely *negative* on Cortex-A5x.
60// Which is why NEON support is limited to SHA256.]
61
62#ifndef	__KERNEL__
63# include "arm_arch.h"
64#endif
65
66.text
67
68.extern	OPENSSL_armcap_P
// sha512_block_data_order -- scalar (general-register) SHA-512 block
// transform for AArch64.
//
// Register use on entry, as read by the code below:
//   x0  pointer to the eight 64-bit state words (loaded into x20..x27)
//   x1  pointer to the input data
//   x2  number of 128-byte blocks (scaled with lsl#7 to form the end pointer)
//
// Stack frame (128 bytes, addressed through x29):
//   [x29,#0]       saved x29/x30
//   [x29,#16..#95] saved x19..x28
//   [x29,#96]      x0 (state pointer)   [x29,#104] end-of-input pointer
//   [x29,#112]     input pointer, written inside .Loop
// A further 32 bytes are reserved below the frame (four 8-byte slots at
// sp) which the round code uses to park message words.
//
// NOTE(review): the block loop is do-while shaped (the x1/x2 compare happens
// only after a block has been processed), so a zero block count is not
// handled here -- presumably callers never pass 0; confirm against callers.
69.globl	sha512_block_data_order
70.type	sha512_block_data_order,%function
71.align	6
72sha512_block_data_order:
// Allocate the frame and save the frame pointer / link register.
73	stp	x29,x30,[sp,#-128]!
74	add	x29,sp,#0
75
// Save x19..x28: all of them are used as working registers below.
76	stp	x19,x20,[sp,#16]
77	stp	x21,x22,[sp,#32]
78	stp	x23,x24,[sp,#48]
79	stp	x25,x26,[sp,#64]
80	stp	x27,x28,[sp,#80]
// Four 8-byte scratch slots at [sp,#0]..[sp,#24].
81	sub	sp,sp,#4*8
82
// Working variables a..h start out in x20..x27.
83	ldp	x20,x21,[x0]				// load context
84	ldp	x22,x23,[x0,#2*8]
85	ldp	x24,x25,[x0,#4*8]
86	add	x2,x1,x2,lsl#7	// end of input
87	ldp	x26,x27,[x0,#6*8]
// x30 (already saved above) is reused as the running pointer into .LK512.
88	adr	x30,.LK512
// x0 and x2 are overwritten with message words later, so keep copies.
89	stp	x0,x2,[x29,#96]
90
// .Loop -- entered once per 128-byte input block; rounds 0..15 are fully
// unrolled here.
//
// * The eight working variables a..h live in x20..x27. Instead of moving
//   values, each round shifts the roles by one register (h is x27 in round
//   0, x26 in round 1, and so on).
// * x30 walks the .LK512 constant table, one 8-byte load per round.
// * x19 and x28 alternate between two jobs: holding the constant fetched
//   for the upcoming round and carrying a^b, which becomes b^c for the Maj
//   computation of the following round.
// * x16 and x17 are temporaries (Sigma1 / Ch, then Sigma0).
// * The sixteen input words are loaded pairwise into x3..x15, x0, x1, x2 and
//   byte-reversed with rev unless __AARCH64EB__ is defined.
// * The "h+=Sigma0(a)" add of each round is deferred to the start of the
//   next round; the commented-out add lines mark where it logically belongs.
// * Until a register x6..x15/x0 has received its input word, it is used as
//   a per-round scratch register.
91.Loop:
92	ldp	x3,x4,[x1],#2*8
93	ldr	x19,[x30],#8			// *K++
94	eor	x28,x21,x22				// magic seed
// Remember the input pointer (already advanced by 16 bytes); x1 itself is
// overwritten with a message word further down.
95	str	x1,[x29,#112]
96#ifndef	__AARCH64EB__
97	rev	x3,x3			// 0
98#endif
99	ror	x16,x24,#14
100	add	x27,x27,x19			// h+=K[i]
101	eor	x6,x24,x24,ror#23
102	and	x17,x25,x24
103	bic	x19,x26,x24
104	add	x27,x27,x3			// h+=X[i]
105	orr	x17,x17,x19			// Ch(e,f,g)
106	eor	x19,x20,x21			// a^b, b^c in next round
107	eor	x16,x16,x6,ror#18	// Sigma1(e)
108	ror	x6,x20,#28
109	add	x27,x27,x17			// h+=Ch(e,f,g)
110	eor	x17,x20,x20,ror#5
111	add	x27,x27,x16			// h+=Sigma1(e)
112	and	x28,x28,x19			// (b^c)&=(a^b)
113	add	x23,x23,x27			// d+=h
114	eor	x28,x28,x21			// Maj(a,b,c)
115	eor	x17,x6,x17,ror#34	// Sigma0(a)
116	add	x27,x27,x28			// h+=Maj(a,b,c)
117	ldr	x28,[x30],#8		// *K++, x19 in next round
118	//add	x27,x27,x17			// h+=Sigma0(a)
119#ifndef	__AARCH64EB__
120	rev	x4,x4			// 1
121#endif
122	ldp	x5,x6,[x1],#2*8
123	add	x27,x27,x17			// h+=Sigma0(a)
124	ror	x16,x23,#14
125	add	x26,x26,x28			// h+=K[i]
126	eor	x7,x23,x23,ror#23
127	and	x17,x24,x23
128	bic	x28,x25,x23
129	add	x26,x26,x4			// h+=X[i]
130	orr	x17,x17,x28			// Ch(e,f,g)
131	eor	x28,x27,x20			// a^b, b^c in next round
132	eor	x16,x16,x7,ror#18	// Sigma1(e)
133	ror	x7,x27,#28
134	add	x26,x26,x17			// h+=Ch(e,f,g)
135	eor	x17,x27,x27,ror#5
136	add	x26,x26,x16			// h+=Sigma1(e)
137	and	x19,x19,x28			// (b^c)&=(a^b)
138	add	x22,x22,x26			// d+=h
139	eor	x19,x19,x20			// Maj(a,b,c)
140	eor	x17,x7,x17,ror#34	// Sigma0(a)
141	add	x26,x26,x19			// h+=Maj(a,b,c)
142	ldr	x19,[x30],#8		// *K++, x28 in next round
143	//add	x26,x26,x17			// h+=Sigma0(a)
144#ifndef	__AARCH64EB__
145	rev	x5,x5			// 2
146#endif
147	add	x26,x26,x17			// h+=Sigma0(a)
148	ror	x16,x22,#14
149	add	x25,x25,x19			// h+=K[i]
150	eor	x8,x22,x22,ror#23
151	and	x17,x23,x22
152	bic	x19,x24,x22
153	add	x25,x25,x5			// h+=X[i]
154	orr	x17,x17,x19			// Ch(e,f,g)
155	eor	x19,x26,x27			// a^b, b^c in next round
156	eor	x16,x16,x8,ror#18	// Sigma1(e)
157	ror	x8,x26,#28
158	add	x25,x25,x17			// h+=Ch(e,f,g)
159	eor	x17,x26,x26,ror#5
160	add	x25,x25,x16			// h+=Sigma1(e)
161	and	x28,x28,x19			// (b^c)&=(a^b)
162	add	x21,x21,x25			// d+=h
163	eor	x28,x28,x27			// Maj(a,b,c)
164	eor	x17,x8,x17,ror#34	// Sigma0(a)
165	add	x25,x25,x28			// h+=Maj(a,b,c)
166	ldr	x28,[x30],#8		// *K++, x19 in next round
167	//add	x25,x25,x17			// h+=Sigma0(a)
168#ifndef	__AARCH64EB__
169	rev	x6,x6			// 3
170#endif
171	ldp	x7,x8,[x1],#2*8
172	add	x25,x25,x17			// h+=Sigma0(a)
173	ror	x16,x21,#14
174	add	x24,x24,x28			// h+=K[i]
175	eor	x9,x21,x21,ror#23
176	and	x17,x22,x21
177	bic	x28,x23,x21
178	add	x24,x24,x6			// h+=X[i]
179	orr	x17,x17,x28			// Ch(e,f,g)
180	eor	x28,x25,x26			// a^b, b^c in next round
181	eor	x16,x16,x9,ror#18	// Sigma1(e)
182	ror	x9,x25,#28
183	add	x24,x24,x17			// h+=Ch(e,f,g)
184	eor	x17,x25,x25,ror#5
185	add	x24,x24,x16			// h+=Sigma1(e)
186	and	x19,x19,x28			// (b^c)&=(a^b)
187	add	x20,x20,x24			// d+=h
188	eor	x19,x19,x26			// Maj(a,b,c)
189	eor	x17,x9,x17,ror#34	// Sigma0(a)
190	add	x24,x24,x19			// h+=Maj(a,b,c)
191	ldr	x19,[x30],#8		// *K++, x28 in next round
192	//add	x24,x24,x17			// h+=Sigma0(a)
193#ifndef	__AARCH64EB__
194	rev	x7,x7			// 4
195#endif
196	add	x24,x24,x17			// h+=Sigma0(a)
197	ror	x16,x20,#14
198	add	x23,x23,x19			// h+=K[i]
199	eor	x10,x20,x20,ror#23
200	and	x17,x21,x20
201	bic	x19,x22,x20
202	add	x23,x23,x7			// h+=X[i]
203	orr	x17,x17,x19			// Ch(e,f,g)
204	eor	x19,x24,x25			// a^b, b^c in next round
205	eor	x16,x16,x10,ror#18	// Sigma1(e)
206	ror	x10,x24,#28
207	add	x23,x23,x17			// h+=Ch(e,f,g)
208	eor	x17,x24,x24,ror#5
209	add	x23,x23,x16			// h+=Sigma1(e)
210	and	x28,x28,x19			// (b^c)&=(a^b)
211	add	x27,x27,x23			// d+=h
212	eor	x28,x28,x25			// Maj(a,b,c)
213	eor	x17,x10,x17,ror#34	// Sigma0(a)
214	add	x23,x23,x28			// h+=Maj(a,b,c)
215	ldr	x28,[x30],#8		// *K++, x19 in next round
216	//add	x23,x23,x17			// h+=Sigma0(a)
217#ifndef	__AARCH64EB__
218	rev	x8,x8			// 5
219#endif
220	ldp	x9,x10,[x1],#2*8
221	add	x23,x23,x17			// h+=Sigma0(a)
222	ror	x16,x27,#14
223	add	x22,x22,x28			// h+=K[i]
224	eor	x11,x27,x27,ror#23
225	and	x17,x20,x27
226	bic	x28,x21,x27
227	add	x22,x22,x8			// h+=X[i]
228	orr	x17,x17,x28			// Ch(e,f,g)
229	eor	x28,x23,x24			// a^b, b^c in next round
230	eor	x16,x16,x11,ror#18	// Sigma1(e)
231	ror	x11,x23,#28
232	add	x22,x22,x17			// h+=Ch(e,f,g)
233	eor	x17,x23,x23,ror#5
234	add	x22,x22,x16			// h+=Sigma1(e)
235	and	x19,x19,x28			// (b^c)&=(a^b)
236	add	x26,x26,x22			// d+=h
237	eor	x19,x19,x24			// Maj(a,b,c)
238	eor	x17,x11,x17,ror#34	// Sigma0(a)
239	add	x22,x22,x19			// h+=Maj(a,b,c)
240	ldr	x19,[x30],#8		// *K++, x28 in next round
241	//add	x22,x22,x17			// h+=Sigma0(a)
242#ifndef	__AARCH64EB__
243	rev	x9,x9			// 6
244#endif
245	add	x22,x22,x17			// h+=Sigma0(a)
246	ror	x16,x26,#14
247	add	x21,x21,x19			// h+=K[i]
248	eor	x12,x26,x26,ror#23
249	and	x17,x27,x26
250	bic	x19,x20,x26
251	add	x21,x21,x9			// h+=X[i]
252	orr	x17,x17,x19			// Ch(e,f,g)
253	eor	x19,x22,x23			// a^b, b^c in next round
254	eor	x16,x16,x12,ror#18	// Sigma1(e)
255	ror	x12,x22,#28
256	add	x21,x21,x17			// h+=Ch(e,f,g)
257	eor	x17,x22,x22,ror#5
258	add	x21,x21,x16			// h+=Sigma1(e)
259	and	x28,x28,x19			// (b^c)&=(a^b)
260	add	x25,x25,x21			// d+=h
261	eor	x28,x28,x23			// Maj(a,b,c)
262	eor	x17,x12,x17,ror#34	// Sigma0(a)
263	add	x21,x21,x28			// h+=Maj(a,b,c)
264	ldr	x28,[x30],#8		// *K++, x19 in next round
265	//add	x21,x21,x17			// h+=Sigma0(a)
266#ifndef	__AARCH64EB__
267	rev	x10,x10			// 7
268#endif
269	ldp	x11,x12,[x1],#2*8
270	add	x21,x21,x17			// h+=Sigma0(a)
271	ror	x16,x25,#14
272	add	x20,x20,x28			// h+=K[i]
273	eor	x13,x25,x25,ror#23
274	and	x17,x26,x25
275	bic	x28,x27,x25
276	add	x20,x20,x10			// h+=X[i]
277	orr	x17,x17,x28			// Ch(e,f,g)
278	eor	x28,x21,x22			// a^b, b^c in next round
279	eor	x16,x16,x13,ror#18	// Sigma1(e)
280	ror	x13,x21,#28
281	add	x20,x20,x17			// h+=Ch(e,f,g)
282	eor	x17,x21,x21,ror#5
283	add	x20,x20,x16			// h+=Sigma1(e)
284	and	x19,x19,x28			// (b^c)&=(a^b)
285	add	x24,x24,x20			// d+=h
286	eor	x19,x19,x22			// Maj(a,b,c)
287	eor	x17,x13,x17,ror#34	// Sigma0(a)
288	add	x20,x20,x19			// h+=Maj(a,b,c)
289	ldr	x19,[x30],#8		// *K++, x28 in next round
290	//add	x20,x20,x17			// h+=Sigma0(a)
291#ifndef	__AARCH64EB__
292	rev	x11,x11			// 8
293#endif
294	add	x20,x20,x17			// h+=Sigma0(a)
295	ror	x16,x24,#14
296	add	x27,x27,x19			// h+=K[i]
297	eor	x14,x24,x24,ror#23
298	and	x17,x25,x24
299	bic	x19,x26,x24
300	add	x27,x27,x11			// h+=X[i]
301	orr	x17,x17,x19			// Ch(e,f,g)
302	eor	x19,x20,x21			// a^b, b^c in next round
303	eor	x16,x16,x14,ror#18	// Sigma1(e)
304	ror	x14,x20,#28
305	add	x27,x27,x17			// h+=Ch(e,f,g)
306	eor	x17,x20,x20,ror#5
307	add	x27,x27,x16			// h+=Sigma1(e)
308	and	x28,x28,x19			// (b^c)&=(a^b)
309	add	x23,x23,x27			// d+=h
310	eor	x28,x28,x21			// Maj(a,b,c)
311	eor	x17,x14,x17,ror#34	// Sigma0(a)
312	add	x27,x27,x28			// h+=Maj(a,b,c)
313	ldr	x28,[x30],#8		// *K++, x19 in next round
314	//add	x27,x27,x17			// h+=Sigma0(a)
315#ifndef	__AARCH64EB__
316	rev	x12,x12			// 9
317#endif
318	ldp	x13,x14,[x1],#2*8
319	add	x27,x27,x17			// h+=Sigma0(a)
320	ror	x16,x23,#14
321	add	x26,x26,x28			// h+=K[i]
322	eor	x15,x23,x23,ror#23
323	and	x17,x24,x23
324	bic	x28,x25,x23
325	add	x26,x26,x12			// h+=X[i]
326	orr	x17,x17,x28			// Ch(e,f,g)
327	eor	x28,x27,x20			// a^b, b^c in next round
328	eor	x16,x16,x15,ror#18	// Sigma1(e)
329	ror	x15,x27,#28
330	add	x26,x26,x17			// h+=Ch(e,f,g)
331	eor	x17,x27,x27,ror#5
332	add	x26,x26,x16			// h+=Sigma1(e)
333	and	x19,x19,x28			// (b^c)&=(a^b)
334	add	x22,x22,x26			// d+=h
335	eor	x19,x19,x20			// Maj(a,b,c)
336	eor	x17,x15,x17,ror#34	// Sigma0(a)
337	add	x26,x26,x19			// h+=Maj(a,b,c)
338	ldr	x19,[x30],#8		// *K++, x28 in next round
339	//add	x26,x26,x17			// h+=Sigma0(a)
340#ifndef	__AARCH64EB__
341	rev	x13,x13			// 10
342#endif
343	add	x26,x26,x17			// h+=Sigma0(a)
344	ror	x16,x22,#14
345	add	x25,x25,x19			// h+=K[i]
// x0 (state pointer, saved at [x29,#96]) is free to use as scratch here.
346	eor	x0,x22,x22,ror#23
347	and	x17,x23,x22
348	bic	x19,x24,x22
349	add	x25,x25,x13			// h+=X[i]
350	orr	x17,x17,x19			// Ch(e,f,g)
351	eor	x19,x26,x27			// a^b, b^c in next round
352	eor	x16,x16,x0,ror#18	// Sigma1(e)
353	ror	x0,x26,#28
354	add	x25,x25,x17			// h+=Ch(e,f,g)
355	eor	x17,x26,x26,ror#5
356	add	x25,x25,x16			// h+=Sigma1(e)
357	and	x28,x28,x19			// (b^c)&=(a^b)
358	add	x21,x21,x25			// d+=h
359	eor	x28,x28,x27			// Maj(a,b,c)
360	eor	x17,x0,x17,ror#34	// Sigma0(a)
361	add	x25,x25,x28			// h+=Maj(a,b,c)
362	ldr	x28,[x30],#8		// *K++, x19 in next round
363	//add	x25,x25,x17			// h+=Sigma0(a)
364#ifndef	__AARCH64EB__
365	rev	x14,x14			// 11
366#endif
367	ldp	x15,x0,[x1],#2*8
368	add	x25,x25,x17			// h+=Sigma0(a)
// From this round on every register x0..x15 holds a message word, so the
// word in the register about to serve as scratch is first parked in one of
// the four slots at sp.
369	str	x6,[sp,#24]
370	ror	x16,x21,#14
371	add	x24,x24,x28			// h+=K[i]
372	eor	x6,x21,x21,ror#23
373	and	x17,x22,x21
374	bic	x28,x23,x21
375	add	x24,x24,x14			// h+=X[i]
376	orr	x17,x17,x28			// Ch(e,f,g)
377	eor	x28,x25,x26			// a^b, b^c in next round
378	eor	x16,x16,x6,ror#18	// Sigma1(e)
379	ror	x6,x25,#28
380	add	x24,x24,x17			// h+=Ch(e,f,g)
381	eor	x17,x25,x25,ror#5
382	add	x24,x24,x16			// h+=Sigma1(e)
383	and	x19,x19,x28			// (b^c)&=(a^b)
384	add	x20,x20,x24			// d+=h
385	eor	x19,x19,x26			// Maj(a,b,c)
386	eor	x17,x6,x17,ror#34	// Sigma0(a)
387	add	x24,x24,x19			// h+=Maj(a,b,c)
388	ldr	x19,[x30],#8		// *K++, x28 in next round
389	//add	x24,x24,x17			// h+=Sigma0(a)
390#ifndef	__AARCH64EB__
391	rev	x15,x15			// 12
392#endif
393	add	x24,x24,x17			// h+=Sigma0(a)
394	str	x7,[sp,#0]
395	ror	x16,x20,#14
396	add	x23,x23,x19			// h+=K[i]
397	eor	x7,x20,x20,ror#23
398	and	x17,x21,x20
399	bic	x19,x22,x20
400	add	x23,x23,x15			// h+=X[i]
401	orr	x17,x17,x19			// Ch(e,f,g)
402	eor	x19,x24,x25			// a^b, b^c in next round
403	eor	x16,x16,x7,ror#18	// Sigma1(e)
404	ror	x7,x24,#28
405	add	x23,x23,x17			// h+=Ch(e,f,g)
406	eor	x17,x24,x24,ror#5
407	add	x23,x23,x16			// h+=Sigma1(e)
408	and	x28,x28,x19			// (b^c)&=(a^b)
409	add	x27,x27,x23			// d+=h
410	eor	x28,x28,x25			// Maj(a,b,c)
411	eor	x17,x7,x17,ror#34	// Sigma0(a)
412	add	x23,x23,x28			// h+=Maj(a,b,c)
413	ldr	x28,[x30],#8		// *K++, x19 in next round
414	//add	x23,x23,x17			// h+=Sigma0(a)
415#ifndef	__AARCH64EB__
416	rev	x0,x0			// 13
417#endif
// Last two input words. This overwrites x1 (input pointer, saved at
// [x29,#112]) and x2 (end pointer, saved at [x29,#104]) with data; no
// post-increment, the remaining advance is applied after the block.
418	ldp	x1,x2,[x1]
419	add	x23,x23,x17			// h+=Sigma0(a)
420	str	x8,[sp,#8]
421	ror	x16,x27,#14
422	add	x22,x22,x28			// h+=K[i]
423	eor	x8,x27,x27,ror#23
424	and	x17,x20,x27
425	bic	x28,x21,x27
426	add	x22,x22,x0			// h+=X[i]
427	orr	x17,x17,x28			// Ch(e,f,g)
428	eor	x28,x23,x24			// a^b, b^c in next round
429	eor	x16,x16,x8,ror#18	// Sigma1(e)
430	ror	x8,x23,#28
431	add	x22,x22,x17			// h+=Ch(e,f,g)
432	eor	x17,x23,x23,ror#5
433	add	x22,x22,x16			// h+=Sigma1(e)
434	and	x19,x19,x28			// (b^c)&=(a^b)
435	add	x26,x26,x22			// d+=h
436	eor	x19,x19,x24			// Maj(a,b,c)
437	eor	x17,x8,x17,ror#34	// Sigma0(a)
438	add	x22,x22,x19			// h+=Maj(a,b,c)
439	ldr	x19,[x30],#8		// *K++, x28 in next round
440	//add	x22,x22,x17			// h+=Sigma0(a)
441#ifndef	__AARCH64EB__
442	rev	x1,x1			// 14
443#endif
444	ldr	x6,[sp,#24]
445	add	x22,x22,x17			// h+=Sigma0(a)
446	str	x9,[sp,#16]
447	ror	x16,x26,#14
448	add	x21,x21,x19			// h+=K[i]
449	eor	x9,x26,x26,ror#23
450	and	x17,x27,x26
451	bic	x19,x20,x26
452	add	x21,x21,x1			// h+=X[i]
453	orr	x17,x17,x19			// Ch(e,f,g)
454	eor	x19,x22,x23			// a^b, b^c in next round
455	eor	x16,x16,x9,ror#18	// Sigma1(e)
456	ror	x9,x22,#28
457	add	x21,x21,x17			// h+=Ch(e,f,g)
458	eor	x17,x22,x22,ror#5
459	add	x21,x21,x16			// h+=Sigma1(e)
460	and	x28,x28,x19			// (b^c)&=(a^b)
461	add	x25,x25,x21			// d+=h
462	eor	x28,x28,x23			// Maj(a,b,c)
463	eor	x17,x9,x17,ror#34	// Sigma0(a)
464	add	x21,x21,x28			// h+=Maj(a,b,c)
465	ldr	x28,[x30],#8		// *K++, x19 in next round
466	//add	x21,x21,x17			// h+=Sigma0(a)
467#ifndef	__AARCH64EB__
468	rev	x2,x2			// 15
469#endif
// Round 15 also starts the message schedule, interleaved with the round
// itself: x3 (word 0) is updated in place to word 16 as
//   x3 + sigma0(x4) + x12 + sigma1(x1)
// with sigma0 = ror1 ^ ror8 ^ lsr7 (built in x9) and
//      sigma1 = ror19 ^ ror61 ^ lsr6 (built in x8).
// Sigma1(e) and Sigma0(a) are formed here with three separate rotates
// (14/18/41 and 28/34/39), and the Sigma0 add is no longer deferred.
470	ldr	x7,[sp,#0]
471	add	x21,x21,x17			// h+=Sigma0(a)
472	str	x10,[sp,#24]
473	ror	x16,x25,#14
474	add	x20,x20,x28			// h+=K[i]
475	ror	x9,x4,#1
476	and	x17,x26,x25
477	ror	x8,x1,#19
478	bic	x28,x27,x25
479	ror	x10,x21,#28
480	add	x20,x20,x2			// h+=X[i]
481	eor	x16,x16,x25,ror#18
482	eor	x9,x9,x4,ror#8
483	orr	x17,x17,x28			// Ch(e,f,g)
484	eor	x28,x21,x22			// a^b, b^c in next round
485	eor	x16,x16,x25,ror#41	// Sigma1(e)
486	eor	x10,x10,x21,ror#34
487	add	x20,x20,x17			// h+=Ch(e,f,g)
488	and	x19,x19,x28			// (b^c)&=(a^b)
489	eor	x8,x8,x1,ror#61
490	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
491	add	x20,x20,x16			// h+=Sigma1(e)
492	eor	x19,x19,x22			// Maj(a,b,c)
493	eor	x17,x10,x21,ror#39	// Sigma0(a)
494	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
495	add	x3,x3,x12
496	add	x24,x24,x20			// d+=h
497	add	x20,x20,x19			// h+=Maj(a,b,c)
498	ldr	x19,[x30],#8		// *K++, x28 in next round
499	add	x3,x3,x9
500	add	x20,x20,x17			// h+=Sigma0(a)
501	add	x3,x3,x8
// .Loop_16_xx -- rounds 16 and up, sixteen unrolled rounds per pass.
//
// Every round does two interleaved things:
//   1. the compression step for the current schedule word (already computed
//      by the previous round): h += K + X + Ch(e,f,g) + Sigma1(e),
//      d += h, h += Maj(a,b,c) + Sigma0(a);
//   2. the in-place computation of the next schedule word:
//      X' += sigma0(following word) + sigma1(word two back) + word nine ahead,
//      where sigma0 = ror1 ^ ror8 ^ lsr7 and sigma1 = ror19 ^ ror61 ^ lsr6.
//
// The sixteen-word schedule window rotates through x3..x15, x0, x1, x2.
// Each round begins by reloading one word from the four slots at sp and
// parking another there, which frees the register used as that round's
// Sigma0 scratch.
//
// As in the first sixteen rounds, a..h rotate through x20..x27, x19/x28
// alternate between "next constant" and "a^b carried for Maj", x16/x17 are
// temporaries and x30 walks .LK512.
//
// The pass is repeated until the constant fetched for the next round is the
// zero terminator of .LK512. After that the working variables are added
// into the state at x0, and the whole procedure restarts at .Loop until the
// input pointer reaches the end pointer.
502.Loop_16_xx:
503	ldr	x8,[sp,#8]
504	str	x11,[sp,#0]
505	ror	x16,x24,#14
506	add	x27,x27,x19			// h+=K[i]
507	ror	x10,x5,#1
508	and	x17,x25,x24
509	ror	x9,x2,#19
510	bic	x19,x26,x24
511	ror	x11,x20,#28
512	add	x27,x27,x3			// h+=X[i]
513	eor	x16,x16,x24,ror#18
514	eor	x10,x10,x5,ror#8
515	orr	x17,x17,x19			// Ch(e,f,g)
516	eor	x19,x20,x21			// a^b, b^c in next round
517	eor	x16,x16,x24,ror#41	// Sigma1(e)
518	eor	x11,x11,x20,ror#34
519	add	x27,x27,x17			// h+=Ch(e,f,g)
520	and	x28,x28,x19			// (b^c)&=(a^b)
521	eor	x9,x9,x2,ror#61
522	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
523	add	x27,x27,x16			// h+=Sigma1(e)
524	eor	x28,x28,x21			// Maj(a,b,c)
525	eor	x17,x11,x20,ror#39	// Sigma0(a)
526	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
527	add	x4,x4,x13
528	add	x23,x23,x27			// d+=h
529	add	x27,x27,x28			// h+=Maj(a,b,c)
530	ldr	x28,[x30],#8		// *K++, x19 in next round
531	add	x4,x4,x10
532	add	x27,x27,x17			// h+=Sigma0(a)
533	add	x4,x4,x9
534	ldr	x9,[sp,#16]
535	str	x12,[sp,#8]
536	ror	x16,x23,#14
537	add	x26,x26,x28			// h+=K[i]
538	ror	x11,x6,#1
539	and	x17,x24,x23
540	ror	x10,x3,#19
541	bic	x28,x25,x23
542	ror	x12,x27,#28
543	add	x26,x26,x4			// h+=X[i]
544	eor	x16,x16,x23,ror#18
545	eor	x11,x11,x6,ror#8
546	orr	x17,x17,x28			// Ch(e,f,g)
547	eor	x28,x27,x20			// a^b, b^c in next round
548	eor	x16,x16,x23,ror#41	// Sigma1(e)
549	eor	x12,x12,x27,ror#34
550	add	x26,x26,x17			// h+=Ch(e,f,g)
551	and	x19,x19,x28			// (b^c)&=(a^b)
552	eor	x10,x10,x3,ror#61
553	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
554	add	x26,x26,x16			// h+=Sigma1(e)
555	eor	x19,x19,x20			// Maj(a,b,c)
556	eor	x17,x12,x27,ror#39	// Sigma0(a)
557	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
558	add	x5,x5,x14
559	add	x22,x22,x26			// d+=h
560	add	x26,x26,x19			// h+=Maj(a,b,c)
561	ldr	x19,[x30],#8		// *K++, x28 in next round
562	add	x5,x5,x11
563	add	x26,x26,x17			// h+=Sigma0(a)
564	add	x5,x5,x10
565	ldr	x10,[sp,#24]
566	str	x13,[sp,#16]
567	ror	x16,x22,#14
568	add	x25,x25,x19			// h+=K[i]
569	ror	x12,x7,#1
570	and	x17,x23,x22
571	ror	x11,x4,#19
572	bic	x19,x24,x22
573	ror	x13,x26,#28
574	add	x25,x25,x5			// h+=X[i]
575	eor	x16,x16,x22,ror#18
576	eor	x12,x12,x7,ror#8
577	orr	x17,x17,x19			// Ch(e,f,g)
578	eor	x19,x26,x27			// a^b, b^c in next round
579	eor	x16,x16,x22,ror#41	// Sigma1(e)
580	eor	x13,x13,x26,ror#34
581	add	x25,x25,x17			// h+=Ch(e,f,g)
582	and	x28,x28,x19			// (b^c)&=(a^b)
583	eor	x11,x11,x4,ror#61
584	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
585	add	x25,x25,x16			// h+=Sigma1(e)
586	eor	x28,x28,x27			// Maj(a,b,c)
587	eor	x17,x13,x26,ror#39	// Sigma0(a)
588	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
589	add	x6,x6,x15
590	add	x21,x21,x25			// d+=h
591	add	x25,x25,x28			// h+=Maj(a,b,c)
592	ldr	x28,[x30],#8		// *K++, x19 in next round
593	add	x6,x6,x12
594	add	x25,x25,x17			// h+=Sigma0(a)
595	add	x6,x6,x11
596	ldr	x11,[sp,#0]
597	str	x14,[sp,#24]
598	ror	x16,x21,#14
599	add	x24,x24,x28			// h+=K[i]
600	ror	x13,x8,#1
601	and	x17,x22,x21
602	ror	x12,x5,#19
603	bic	x28,x23,x21
604	ror	x14,x25,#28
605	add	x24,x24,x6			// h+=X[i]
606	eor	x16,x16,x21,ror#18
607	eor	x13,x13,x8,ror#8
608	orr	x17,x17,x28			// Ch(e,f,g)
609	eor	x28,x25,x26			// a^b, b^c in next round
610	eor	x16,x16,x21,ror#41	// Sigma1(e)
611	eor	x14,x14,x25,ror#34
612	add	x24,x24,x17			// h+=Ch(e,f,g)
613	and	x19,x19,x28			// (b^c)&=(a^b)
614	eor	x12,x12,x5,ror#61
615	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
616	add	x24,x24,x16			// h+=Sigma1(e)
617	eor	x19,x19,x26			// Maj(a,b,c)
618	eor	x17,x14,x25,ror#39	// Sigma0(a)
619	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
620	add	x7,x7,x0
621	add	x20,x20,x24			// d+=h
622	add	x24,x24,x19			// h+=Maj(a,b,c)
623	ldr	x19,[x30],#8		// *K++, x28 in next round
624	add	x7,x7,x13
625	add	x24,x24,x17			// h+=Sigma0(a)
626	add	x7,x7,x12
627	ldr	x12,[sp,#8]
628	str	x15,[sp,#0]
629	ror	x16,x20,#14
630	add	x23,x23,x19			// h+=K[i]
631	ror	x14,x9,#1
632	and	x17,x21,x20
633	ror	x13,x6,#19
634	bic	x19,x22,x20
635	ror	x15,x24,#28
636	add	x23,x23,x7			// h+=X[i]
637	eor	x16,x16,x20,ror#18
638	eor	x14,x14,x9,ror#8
639	orr	x17,x17,x19			// Ch(e,f,g)
640	eor	x19,x24,x25			// a^b, b^c in next round
641	eor	x16,x16,x20,ror#41	// Sigma1(e)
642	eor	x15,x15,x24,ror#34
643	add	x23,x23,x17			// h+=Ch(e,f,g)
644	and	x28,x28,x19			// (b^c)&=(a^b)
645	eor	x13,x13,x6,ror#61
646	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
647	add	x23,x23,x16			// h+=Sigma1(e)
648	eor	x28,x28,x25			// Maj(a,b,c)
649	eor	x17,x15,x24,ror#39	// Sigma0(a)
650	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
651	add	x8,x8,x1
652	add	x27,x27,x23			// d+=h
653	add	x23,x23,x28			// h+=Maj(a,b,c)
654	ldr	x28,[x30],#8		// *K++, x19 in next round
655	add	x8,x8,x14
656	add	x23,x23,x17			// h+=Sigma0(a)
657	add	x8,x8,x13
658	ldr	x13,[sp,#16]
659	str	x0,[sp,#8]
660	ror	x16,x27,#14
661	add	x22,x22,x28			// h+=K[i]
662	ror	x15,x10,#1
663	and	x17,x20,x27
664	ror	x14,x7,#19
665	bic	x28,x21,x27
666	ror	x0,x23,#28
667	add	x22,x22,x8			// h+=X[i]
668	eor	x16,x16,x27,ror#18
669	eor	x15,x15,x10,ror#8
670	orr	x17,x17,x28			// Ch(e,f,g)
671	eor	x28,x23,x24			// a^b, b^c in next round
672	eor	x16,x16,x27,ror#41	// Sigma1(e)
673	eor	x0,x0,x23,ror#34
674	add	x22,x22,x17			// h+=Ch(e,f,g)
675	and	x19,x19,x28			// (b^c)&=(a^b)
676	eor	x14,x14,x7,ror#61
677	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
678	add	x22,x22,x16			// h+=Sigma1(e)
679	eor	x19,x19,x24			// Maj(a,b,c)
680	eor	x17,x0,x23,ror#39	// Sigma0(a)
681	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
682	add	x9,x9,x2
683	add	x26,x26,x22			// d+=h
684	add	x22,x22,x19			// h+=Maj(a,b,c)
685	ldr	x19,[x30],#8		// *K++, x28 in next round
686	add	x9,x9,x15
687	add	x22,x22,x17			// h+=Sigma0(a)
688	add	x9,x9,x14
689	ldr	x14,[sp,#24]
690	str	x1,[sp,#16]
691	ror	x16,x26,#14
692	add	x21,x21,x19			// h+=K[i]
693	ror	x0,x11,#1
694	and	x17,x27,x26
695	ror	x15,x8,#19
696	bic	x19,x20,x26
697	ror	x1,x22,#28
698	add	x21,x21,x9			// h+=X[i]
699	eor	x16,x16,x26,ror#18
700	eor	x0,x0,x11,ror#8
701	orr	x17,x17,x19			// Ch(e,f,g)
702	eor	x19,x22,x23			// a^b, b^c in next round
703	eor	x16,x16,x26,ror#41	// Sigma1(e)
704	eor	x1,x1,x22,ror#34
705	add	x21,x21,x17			// h+=Ch(e,f,g)
706	and	x28,x28,x19			// (b^c)&=(a^b)
707	eor	x15,x15,x8,ror#61
708	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
709	add	x21,x21,x16			// h+=Sigma1(e)
710	eor	x28,x28,x23			// Maj(a,b,c)
711	eor	x17,x1,x22,ror#39	// Sigma0(a)
712	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
713	add	x10,x10,x3
714	add	x25,x25,x21			// d+=h
715	add	x21,x21,x28			// h+=Maj(a,b,c)
716	ldr	x28,[x30],#8		// *K++, x19 in next round
717	add	x10,x10,x0
718	add	x21,x21,x17			// h+=Sigma0(a)
719	add	x10,x10,x15
720	ldr	x15,[sp,#0]
721	str	x2,[sp,#24]
722	ror	x16,x25,#14
723	add	x20,x20,x28			// h+=K[i]
724	ror	x1,x12,#1
725	and	x17,x26,x25
726	ror	x0,x9,#19
727	bic	x28,x27,x25
728	ror	x2,x21,#28
729	add	x20,x20,x10			// h+=X[i]
730	eor	x16,x16,x25,ror#18
731	eor	x1,x1,x12,ror#8
732	orr	x17,x17,x28			// Ch(e,f,g)
733	eor	x28,x21,x22			// a^b, b^c in next round
734	eor	x16,x16,x25,ror#41	// Sigma1(e)
735	eor	x2,x2,x21,ror#34
736	add	x20,x20,x17			// h+=Ch(e,f,g)
737	and	x19,x19,x28			// (b^c)&=(a^b)
738	eor	x0,x0,x9,ror#61
739	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
740	add	x20,x20,x16			// h+=Sigma1(e)
741	eor	x19,x19,x22			// Maj(a,b,c)
742	eor	x17,x2,x21,ror#39	// Sigma0(a)
743	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
744	add	x11,x11,x4
745	add	x24,x24,x20			// d+=h
746	add	x20,x20,x19			// h+=Maj(a,b,c)
747	ldr	x19,[x30],#8		// *K++, x28 in next round
748	add	x11,x11,x1
749	add	x20,x20,x17			// h+=Sigma0(a)
750	add	x11,x11,x0
751	ldr	x0,[sp,#8]
752	str	x3,[sp,#0]
753	ror	x16,x24,#14
754	add	x27,x27,x19			// h+=K[i]
755	ror	x2,x13,#1
756	and	x17,x25,x24
757	ror	x1,x10,#19
758	bic	x19,x26,x24
759	ror	x3,x20,#28
760	add	x27,x27,x11			// h+=X[i]
761	eor	x16,x16,x24,ror#18
762	eor	x2,x2,x13,ror#8
763	orr	x17,x17,x19			// Ch(e,f,g)
764	eor	x19,x20,x21			// a^b, b^c in next round
765	eor	x16,x16,x24,ror#41	// Sigma1(e)
766	eor	x3,x3,x20,ror#34
767	add	x27,x27,x17			// h+=Ch(e,f,g)
768	and	x28,x28,x19			// (b^c)&=(a^b)
769	eor	x1,x1,x10,ror#61
770	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
771	add	x27,x27,x16			// h+=Sigma1(e)
772	eor	x28,x28,x21			// Maj(a,b,c)
773	eor	x17,x3,x20,ror#39	// Sigma0(a)
774	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
775	add	x12,x12,x5
776	add	x23,x23,x27			// d+=h
777	add	x27,x27,x28			// h+=Maj(a,b,c)
778	ldr	x28,[x30],#8		// *K++, x19 in next round
779	add	x12,x12,x2
780	add	x27,x27,x17			// h+=Sigma0(a)
781	add	x12,x12,x1
782	ldr	x1,[sp,#16]
783	str	x4,[sp,#8]
784	ror	x16,x23,#14
785	add	x26,x26,x28			// h+=K[i]
786	ror	x3,x14,#1
787	and	x17,x24,x23
788	ror	x2,x11,#19
789	bic	x28,x25,x23
790	ror	x4,x27,#28
791	add	x26,x26,x12			// h+=X[i]
792	eor	x16,x16,x23,ror#18
793	eor	x3,x3,x14,ror#8
794	orr	x17,x17,x28			// Ch(e,f,g)
795	eor	x28,x27,x20			// a^b, b^c in next round
796	eor	x16,x16,x23,ror#41	// Sigma1(e)
797	eor	x4,x4,x27,ror#34
798	add	x26,x26,x17			// h+=Ch(e,f,g)
799	and	x19,x19,x28			// (b^c)&=(a^b)
800	eor	x2,x2,x11,ror#61
801	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
802	add	x26,x26,x16			// h+=Sigma1(e)
803	eor	x19,x19,x20			// Maj(a,b,c)
804	eor	x17,x4,x27,ror#39	// Sigma0(a)
805	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
806	add	x13,x13,x6
807	add	x22,x22,x26			// d+=h
808	add	x26,x26,x19			// h+=Maj(a,b,c)
809	ldr	x19,[x30],#8		// *K++, x28 in next round
810	add	x13,x13,x3
811	add	x26,x26,x17			// h+=Sigma0(a)
812	add	x13,x13,x2
813	ldr	x2,[sp,#24]
814	str	x5,[sp,#16]
815	ror	x16,x22,#14
816	add	x25,x25,x19			// h+=K[i]
817	ror	x4,x15,#1
818	and	x17,x23,x22
819	ror	x3,x12,#19
820	bic	x19,x24,x22
821	ror	x5,x26,#28
822	add	x25,x25,x13			// h+=X[i]
823	eor	x16,x16,x22,ror#18
824	eor	x4,x4,x15,ror#8
825	orr	x17,x17,x19			// Ch(e,f,g)
826	eor	x19,x26,x27			// a^b, b^c in next round
827	eor	x16,x16,x22,ror#41	// Sigma1(e)
828	eor	x5,x5,x26,ror#34
829	add	x25,x25,x17			// h+=Ch(e,f,g)
830	and	x28,x28,x19			// (b^c)&=(a^b)
831	eor	x3,x3,x12,ror#61
832	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
833	add	x25,x25,x16			// h+=Sigma1(e)
834	eor	x28,x28,x27			// Maj(a,b,c)
835	eor	x17,x5,x26,ror#39	// Sigma0(a)
836	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
837	add	x14,x14,x7
838	add	x21,x21,x25			// d+=h
839	add	x25,x25,x28			// h+=Maj(a,b,c)
840	ldr	x28,[x30],#8		// *K++, x19 in next round
841	add	x14,x14,x4
842	add	x25,x25,x17			// h+=Sigma0(a)
843	add	x14,x14,x3
844	ldr	x3,[sp,#0]
845	str	x6,[sp,#24]
846	ror	x16,x21,#14
847	add	x24,x24,x28			// h+=K[i]
848	ror	x5,x0,#1
849	and	x17,x22,x21
850	ror	x4,x13,#19
851	bic	x28,x23,x21
852	ror	x6,x25,#28
853	add	x24,x24,x14			// h+=X[i]
854	eor	x16,x16,x21,ror#18
855	eor	x5,x5,x0,ror#8
856	orr	x17,x17,x28			// Ch(e,f,g)
857	eor	x28,x25,x26			// a^b, b^c in next round
858	eor	x16,x16,x21,ror#41	// Sigma1(e)
859	eor	x6,x6,x25,ror#34
860	add	x24,x24,x17			// h+=Ch(e,f,g)
861	and	x19,x19,x28			// (b^c)&=(a^b)
862	eor	x4,x4,x13,ror#61
863	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
864	add	x24,x24,x16			// h+=Sigma1(e)
865	eor	x19,x19,x26			// Maj(a,b,c)
866	eor	x17,x6,x25,ror#39	// Sigma0(a)
867	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
868	add	x15,x15,x8
869	add	x20,x20,x24			// d+=h
870	add	x24,x24,x19			// h+=Maj(a,b,c)
871	ldr	x19,[x30],#8		// *K++, x28 in next round
872	add	x15,x15,x5
873	add	x24,x24,x17			// h+=Sigma0(a)
874	add	x15,x15,x4
875	ldr	x4,[sp,#8]
876	str	x7,[sp,#0]
877	ror	x16,x20,#14
878	add	x23,x23,x19			// h+=K[i]
879	ror	x6,x1,#1
880	and	x17,x21,x20
881	ror	x5,x14,#19
882	bic	x19,x22,x20
883	ror	x7,x24,#28
884	add	x23,x23,x15			// h+=X[i]
885	eor	x16,x16,x20,ror#18
886	eor	x6,x6,x1,ror#8
887	orr	x17,x17,x19			// Ch(e,f,g)
888	eor	x19,x24,x25			// a^b, b^c in next round
889	eor	x16,x16,x20,ror#41	// Sigma1(e)
890	eor	x7,x7,x24,ror#34
891	add	x23,x23,x17			// h+=Ch(e,f,g)
892	and	x28,x28,x19			// (b^c)&=(a^b)
893	eor	x5,x5,x14,ror#61
894	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
895	add	x23,x23,x16			// h+=Sigma1(e)
896	eor	x28,x28,x25			// Maj(a,b,c)
897	eor	x17,x7,x24,ror#39	// Sigma0(a)
898	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
899	add	x0,x0,x9
900	add	x27,x27,x23			// d+=h
901	add	x23,x23,x28			// h+=Maj(a,b,c)
902	ldr	x28,[x30],#8		// *K++, x19 in next round
903	add	x0,x0,x6
904	add	x23,x23,x17			// h+=Sigma0(a)
905	add	x0,x0,x5
906	ldr	x5,[sp,#16]
907	str	x8,[sp,#8]
908	ror	x16,x27,#14
909	add	x22,x22,x28			// h+=K[i]
910	ror	x7,x2,#1
911	and	x17,x20,x27
912	ror	x6,x15,#19
913	bic	x28,x21,x27
914	ror	x8,x23,#28
915	add	x22,x22,x0			// h+=X[i]
916	eor	x16,x16,x27,ror#18
917	eor	x7,x7,x2,ror#8
918	orr	x17,x17,x28			// Ch(e,f,g)
919	eor	x28,x23,x24			// a^b, b^c in next round
920	eor	x16,x16,x27,ror#41	// Sigma1(e)
921	eor	x8,x8,x23,ror#34
922	add	x22,x22,x17			// h+=Ch(e,f,g)
923	and	x19,x19,x28			// (b^c)&=(a^b)
924	eor	x6,x6,x15,ror#61
925	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
926	add	x22,x22,x16			// h+=Sigma1(e)
927	eor	x19,x19,x24			// Maj(a,b,c)
928	eor	x17,x8,x23,ror#39	// Sigma0(a)
929	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
930	add	x1,x1,x10
931	add	x26,x26,x22			// d+=h
932	add	x22,x22,x19			// h+=Maj(a,b,c)
933	ldr	x19,[x30],#8		// *K++, x28 in next round
934	add	x1,x1,x7
935	add	x22,x22,x17			// h+=Sigma0(a)
936	add	x1,x1,x6
937	ldr	x6,[sp,#24]
938	str	x9,[sp,#16]
939	ror	x16,x26,#14
940	add	x21,x21,x19			// h+=K[i]
941	ror	x8,x3,#1
942	and	x17,x27,x26
943	ror	x7,x0,#19
944	bic	x19,x20,x26
945	ror	x9,x22,#28
946	add	x21,x21,x1			// h+=X[i]
947	eor	x16,x16,x26,ror#18
948	eor	x8,x8,x3,ror#8
949	orr	x17,x17,x19			// Ch(e,f,g)
950	eor	x19,x22,x23			// a^b, b^c in next round
951	eor	x16,x16,x26,ror#41	// Sigma1(e)
952	eor	x9,x9,x22,ror#34
953	add	x21,x21,x17			// h+=Ch(e,f,g)
954	and	x28,x28,x19			// (b^c)&=(a^b)
955	eor	x7,x7,x0,ror#61
956	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
957	add	x21,x21,x16			// h+=Sigma1(e)
958	eor	x28,x28,x23			// Maj(a,b,c)
959	eor	x17,x9,x22,ror#39	// Sigma0(a)
960	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
961	add	x2,x2,x11
962	add	x25,x25,x21			// d+=h
963	add	x21,x21,x28			// h+=Maj(a,b,c)
964	ldr	x28,[x30],#8		// *K++, x19 in next round
965	add	x2,x2,x8
966	add	x21,x21,x17			// h+=Sigma0(a)
967	add	x2,x2,x7
968	ldr	x7,[sp,#0]
969	str	x10,[sp,#24]
970	ror	x16,x25,#14
971	add	x20,x20,x28			// h+=K[i]
972	ror	x9,x4,#1
973	and	x17,x26,x25
974	ror	x8,x1,#19
975	bic	x28,x27,x25
976	ror	x10,x21,#28
977	add	x20,x20,x2			// h+=X[i]
978	eor	x16,x16,x25,ror#18
979	eor	x9,x9,x4,ror#8
980	orr	x17,x17,x28			// Ch(e,f,g)
981	eor	x28,x21,x22			// a^b, b^c in next round
982	eor	x16,x16,x25,ror#41	// Sigma1(e)
983	eor	x10,x10,x21,ror#34
984	add	x20,x20,x17			// h+=Ch(e,f,g)
985	and	x19,x19,x28			// (b^c)&=(a^b)
986	eor	x8,x8,x1,ror#61
987	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
988	add	x20,x20,x16			// h+=Sigma1(e)
989	eor	x19,x19,x22			// Maj(a,b,c)
990	eor	x17,x10,x21,ror#39	// Sigma0(a)
991	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
992	add	x3,x3,x12
993	add	x24,x24,x20			// d+=h
994	add	x20,x20,x19			// h+=Maj(a,b,c)
995	ldr	x19,[x30],#8		// *K++, x28 in next round
996	add	x3,x3,x9
997	add	x20,x20,x17			// h+=Sigma0(a)
998	add	x3,x3,x8
// x19 holds the constant just fetched for the next round. The table ends
// with a zero .quad, so a zero here means all constants have been used.
999	cbnz	x19,.Loop_16_xx
1000
// Block finished: recover the state pointer (x0), end-of-input pointer (x2)
// and the saved input pointer (x1), all of which were overwritten by
// message words during the rounds.
1001	ldp	x0,x2,[x29,#96]
1002	ldr	x1,[x29,#112]
// 648 = 81*8: the 80 constants plus the terminator that was read above, so
// x30 points at the start of .LK512 again.
1003	sub	x30,x30,#648		// rewind
1004
// Add the working variables a..h (x20..x27) into the stored state and write
// the result back; the loads, adds and stores are interleaved.
1005	ldp	x3,x4,[x0]
1006	ldp	x5,x6,[x0,#2*8]
// The pointer was saved after the first 16 bytes had been consumed; adding
// the remaining 14 words moves it to the next 128-byte block.
1007	add	x1,x1,#14*8			// advance input pointer
1008	ldp	x7,x8,[x0,#4*8]
1009	add	x20,x20,x3
1010	ldp	x9,x10,[x0,#6*8]
1011	add	x21,x21,x4
1012	add	x22,x22,x5
1013	add	x23,x23,x6
1014	stp	x20,x21,[x0]
1015	add	x24,x24,x7
1016	add	x25,x25,x8
1017	stp	x22,x23,[x0,#2*8]
1018	add	x26,x26,x9
1019	add	x27,x27,x10
// Flags from this compare survive the two stores (stp does not set flags)
// and are consumed by b.ne below.
1020	cmp	x1,x2
1021	stp	x24,x25,[x0,#4*8]
1022	stp	x26,x27,[x0,#6*8]
1023	b.ne	.Loop
1024
// Epilogue: restore x19..x28 through x29, drop the 32-byte scratch area,
// then pop the 128-byte frame (restoring x29 and the real return address
// into x30) and return.
1025	ldp	x19,x20,[x29,#16]
1026	add	sp,sp,#4*8
1027	ldp	x21,x22,[x29,#32]
1028	ldp	x23,x24,[x29,#48]
1029	ldp	x25,x26,[x29,#64]
1030	ldp	x27,x28,[x29,#80]
1031	ldp	x29,x30,[sp],#128
1032	ret
1033.size	sha512_block_data_order,.-sha512_block_data_order
1034
// .LK512 -- table of 64-bit round constants read sequentially through x30
// by sha512_block_data_order (one .quad per round), laid out two per line.
// There are 80 constants followed by a single zero .quad. The round loop
// tests each fetched constant with cbnz, so the zero acts as the end marker
// and must stay last. Total size is 81*8 = 648 bytes, which is the amount
// the function subtracts from x30 to rewind after each block.
// The table follows the function in the same section and is addressed with
// a PC-relative adr.
1035.align	6
1036.type	.LK512,%object
1037.LK512:
1038	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1039	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1040	.quad	0x3956c25bf348b538,0x59f111f1b605d019
1041	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1042	.quad	0xd807aa98a3030242,0x12835b0145706fbe
1043	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1044	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1045	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1046	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1047	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1048	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1049	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1050	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1051	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1052	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1053	.quad	0x06ca6351e003826f,0x142929670a0e6e70
1054	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1055	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1056	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1057	.quad	0x81c2c92e47edaee6,0x92722c851482353b
1058	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1059	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1060	.quad	0xd192e819d6ef5218,0xd69906245565a910
1061	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1062	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1063	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1064	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1065	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1066	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1067	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1068	.quad	0x90befffa23631e28,0xa4506cebde82bde9
1069	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1070	.quad	0xca273eceea26619c,0xd186b8c721c0c207
1071	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1072	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1073	.quad	0x113f9804bef90dae,0x1b710b35131c471b
1074	.quad	0x28db77f523047d84,0x32caab7b40c72493
1075	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1076	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1077	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1078	.quad	0	// terminator
1079.size	.LK512,.-.LK512
// Non-kernel builds only: .LOPENSSL_armcap_P stores the distance from this
// location to the OPENSSL_armcap_P symbol, as a 32-bit value when __ILP32__
// is defined and as a 64-bit value otherwise. No code visible in this file
// reads it.
1080#ifndef	__KERNEL__
1081.align	3
1082.LOPENSSL_armcap_P:
1083# ifdef	__ILP32__
1084	.long	OPENSSL_armcap_P-.
1085# else
1086	.quad	OPENSSL_armcap_P-.
1087# endif
1088#endif
// NUL-terminated identification string emitted into the object, followed by
// re-alignment to a 4-byte boundary.
1089.asciz	"SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1090.align	2
// Non-kernel builds only: declare OPENSSL_armcap_P as a 4-byte common
// symbol with 4-byte alignment.
1091#ifndef	__KERNEL__
1092.comm	OPENSSL_armcap_P,4,4
1093#endif
1094