xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/sha512-armv8-linux.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
8//
9// Licensed under the OpenSSL license (the "License").  You may not use
10// this file except in compliance with the License.  You can obtain a copy
11// in the file LICENSE in the source distribution or at
12// https://www.openssl.org/source/license.html
13
14// ====================================================================
15// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
16// project. The module is, however, dual licensed under OpenSSL and
17// CRYPTOGAMS licenses depending on where you obtain it. For further
18// details see http://www.openssl.org/~appro/cryptogams/.
19//
20// Permission to use under GPLv2 terms is granted.
21// ====================================================================
22//
23// SHA256/512 for ARMv8.
24//
25// Performance in cycles per processed byte and improvement coefficient
26// over code generated with "default" compiler:
27//
28//		SHA256-hw	SHA256(*)	SHA512
29// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
30// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
31// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
32// Denver	2.01		10.5 (+26%)	6.70 (+8%)
33// X-Gene			20.0 (+100%)	12.8 (+300%(***))
34// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
35// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
36//
37// (*)	Software SHA256 results are of lesser relevance, presented
38//	mostly for informational purposes.
39// (**)	The result is a trade-off: it's possible to improve it by
40//	10% (or by 1 cycle per round), but at the cost of 20% loss
41//	on Cortex-A53 (or by 4 cycles per round).
42// (***)	Super-impressive coefficients over gcc-generated code are
43//	indication of some compiler "pathology", most notably code
44//	generated with -mgeneral-regs-only is significantly faster
45//	and the gap is only 40-90%.
46
47#ifndef	__KERNEL__
48# include <openssl/arm_arch.h>
49#endif
50
51.text
52
53.globl	sha512_block_data_order_nohw
54.hidden	sha512_block_data_order_nohw
55.type	sha512_block_data_order_nohw,%function
56.align	6
57sha512_block_data_order_nohw:
58	AARCH64_SIGN_LINK_REGISTER
59	stp	x29,x30,[sp,#-128]!
60	add	x29,sp,#0
61
62	stp	x19,x20,[sp,#16]
63	stp	x21,x22,[sp,#32]
64	stp	x23,x24,[sp,#48]
65	stp	x25,x26,[sp,#64]
66	stp	x27,x28,[sp,#80]
67	sub	sp,sp,#4*8
68
69	ldp	x20,x21,[x0]				// load context
70	ldp	x22,x23,[x0,#2*8]
71	ldp	x24,x25,[x0,#4*8]
72	add	x2,x1,x2,lsl#7	// end of input
73	ldp	x26,x27,[x0,#6*8]
74	adrp	x30,.LK512
75	add	x30,x30,:lo12:.LK512
76	stp	x0,x2,[x29,#96]
77
78.Loop:
79	ldp	x3,x4,[x1],#2*8
80	ldr	x19,[x30],#8			// *K++
81	eor	x28,x21,x22				// magic seed
82	str	x1,[x29,#112]
83#ifndef	__AARCH64EB__
84	rev	x3,x3			// 0
85#endif
86	ror	x16,x24,#14
87	add	x27,x27,x19			// h+=K[i]
88	eor	x6,x24,x24,ror#23
89	and	x17,x25,x24
90	bic	x19,x26,x24
91	add	x27,x27,x3			// h+=X[i]
92	orr	x17,x17,x19			// Ch(e,f,g)
93	eor	x19,x20,x21			// a^b, b^c in next round
94	eor	x16,x16,x6,ror#18	// Sigma1(e)
95	ror	x6,x20,#28
96	add	x27,x27,x17			// h+=Ch(e,f,g)
97	eor	x17,x20,x20,ror#5
98	add	x27,x27,x16			// h+=Sigma1(e)
99	and	x28,x28,x19			// (b^c)&=(a^b)
100	add	x23,x23,x27			// d+=h
101	eor	x28,x28,x21			// Maj(a,b,c)
102	eor	x17,x6,x17,ror#34	// Sigma0(a)
103	add	x27,x27,x28			// h+=Maj(a,b,c)
104	ldr	x28,[x30],#8		// *K++, x19 in next round
105	//add	x27,x27,x17			// h+=Sigma0(a)
106#ifndef	__AARCH64EB__
107	rev	x4,x4			// 1
108#endif
109	ldp	x5,x6,[x1],#2*8
110	add	x27,x27,x17			// h+=Sigma0(a)
111	ror	x16,x23,#14
112	add	x26,x26,x28			// h+=K[i]
113	eor	x7,x23,x23,ror#23
114	and	x17,x24,x23
115	bic	x28,x25,x23
116	add	x26,x26,x4			// h+=X[i]
117	orr	x17,x17,x28			// Ch(e,f,g)
118	eor	x28,x27,x20			// a^b, b^c in next round
119	eor	x16,x16,x7,ror#18	// Sigma1(e)
120	ror	x7,x27,#28
121	add	x26,x26,x17			// h+=Ch(e,f,g)
122	eor	x17,x27,x27,ror#5
123	add	x26,x26,x16			// h+=Sigma1(e)
124	and	x19,x19,x28			// (b^c)&=(a^b)
125	add	x22,x22,x26			// d+=h
126	eor	x19,x19,x20			// Maj(a,b,c)
127	eor	x17,x7,x17,ror#34	// Sigma0(a)
128	add	x26,x26,x19			// h+=Maj(a,b,c)
129	ldr	x19,[x30],#8		// *K++, x28 in next round
130	//add	x26,x26,x17			// h+=Sigma0(a)
131#ifndef	__AARCH64EB__
132	rev	x5,x5			// 2
133#endif
134	add	x26,x26,x17			// h+=Sigma0(a)
135	ror	x16,x22,#14
136	add	x25,x25,x19			// h+=K[i]
137	eor	x8,x22,x22,ror#23
138	and	x17,x23,x22
139	bic	x19,x24,x22
140	add	x25,x25,x5			// h+=X[i]
141	orr	x17,x17,x19			// Ch(e,f,g)
142	eor	x19,x26,x27			// a^b, b^c in next round
143	eor	x16,x16,x8,ror#18	// Sigma1(e)
144	ror	x8,x26,#28
145	add	x25,x25,x17			// h+=Ch(e,f,g)
146	eor	x17,x26,x26,ror#5
147	add	x25,x25,x16			// h+=Sigma1(e)
148	and	x28,x28,x19			// (b^c)&=(a^b)
149	add	x21,x21,x25			// d+=h
150	eor	x28,x28,x27			// Maj(a,b,c)
151	eor	x17,x8,x17,ror#34	// Sigma0(a)
152	add	x25,x25,x28			// h+=Maj(a,b,c)
153	ldr	x28,[x30],#8		// *K++, x19 in next round
154	//add	x25,x25,x17			// h+=Sigma0(a)
155#ifndef	__AARCH64EB__
156	rev	x6,x6			// 3
157#endif
158	ldp	x7,x8,[x1],#2*8
159	add	x25,x25,x17			// h+=Sigma0(a)
160	ror	x16,x21,#14
161	add	x24,x24,x28			// h+=K[i]
162	eor	x9,x21,x21,ror#23
163	and	x17,x22,x21
164	bic	x28,x23,x21
165	add	x24,x24,x6			// h+=X[i]
166	orr	x17,x17,x28			// Ch(e,f,g)
167	eor	x28,x25,x26			// a^b, b^c in next round
168	eor	x16,x16,x9,ror#18	// Sigma1(e)
169	ror	x9,x25,#28
170	add	x24,x24,x17			// h+=Ch(e,f,g)
171	eor	x17,x25,x25,ror#5
172	add	x24,x24,x16			// h+=Sigma1(e)
173	and	x19,x19,x28			// (b^c)&=(a^b)
174	add	x20,x20,x24			// d+=h
175	eor	x19,x19,x26			// Maj(a,b,c)
176	eor	x17,x9,x17,ror#34	// Sigma0(a)
177	add	x24,x24,x19			// h+=Maj(a,b,c)
178	ldr	x19,[x30],#8		// *K++, x28 in next round
179	//add	x24,x24,x17			// h+=Sigma0(a)
180#ifndef	__AARCH64EB__
181	rev	x7,x7			// 4
182#endif
183	add	x24,x24,x17			// h+=Sigma0(a)
184	ror	x16,x20,#14
185	add	x23,x23,x19			// h+=K[i]
186	eor	x10,x20,x20,ror#23
187	and	x17,x21,x20
188	bic	x19,x22,x20
189	add	x23,x23,x7			// h+=X[i]
190	orr	x17,x17,x19			// Ch(e,f,g)
191	eor	x19,x24,x25			// a^b, b^c in next round
192	eor	x16,x16,x10,ror#18	// Sigma1(e)
193	ror	x10,x24,#28
194	add	x23,x23,x17			// h+=Ch(e,f,g)
195	eor	x17,x24,x24,ror#5
196	add	x23,x23,x16			// h+=Sigma1(e)
197	and	x28,x28,x19			// (b^c)&=(a^b)
198	add	x27,x27,x23			// d+=h
199	eor	x28,x28,x25			// Maj(a,b,c)
200	eor	x17,x10,x17,ror#34	// Sigma0(a)
201	add	x23,x23,x28			// h+=Maj(a,b,c)
202	ldr	x28,[x30],#8		// *K++, x19 in next round
203	//add	x23,x23,x17			// h+=Sigma0(a)
204#ifndef	__AARCH64EB__
205	rev	x8,x8			// 5
206#endif
207	ldp	x9,x10,[x1],#2*8
208	add	x23,x23,x17			// h+=Sigma0(a)
209	ror	x16,x27,#14
210	add	x22,x22,x28			// h+=K[i]
211	eor	x11,x27,x27,ror#23
212	and	x17,x20,x27
213	bic	x28,x21,x27
214	add	x22,x22,x8			// h+=X[i]
215	orr	x17,x17,x28			// Ch(e,f,g)
216	eor	x28,x23,x24			// a^b, b^c in next round
217	eor	x16,x16,x11,ror#18	// Sigma1(e)
218	ror	x11,x23,#28
219	add	x22,x22,x17			// h+=Ch(e,f,g)
220	eor	x17,x23,x23,ror#5
221	add	x22,x22,x16			// h+=Sigma1(e)
222	and	x19,x19,x28			// (b^c)&=(a^b)
223	add	x26,x26,x22			// d+=h
224	eor	x19,x19,x24			// Maj(a,b,c)
225	eor	x17,x11,x17,ror#34	// Sigma0(a)
226	add	x22,x22,x19			// h+=Maj(a,b,c)
227	ldr	x19,[x30],#8		// *K++, x28 in next round
228	//add	x22,x22,x17			// h+=Sigma0(a)
229#ifndef	__AARCH64EB__
230	rev	x9,x9			// 6
231#endif
232	add	x22,x22,x17			// h+=Sigma0(a)
233	ror	x16,x26,#14
234	add	x21,x21,x19			// h+=K[i]
235	eor	x12,x26,x26,ror#23
236	and	x17,x27,x26
237	bic	x19,x20,x26
238	add	x21,x21,x9			// h+=X[i]
239	orr	x17,x17,x19			// Ch(e,f,g)
240	eor	x19,x22,x23			// a^b, b^c in next round
241	eor	x16,x16,x12,ror#18	// Sigma1(e)
242	ror	x12,x22,#28
243	add	x21,x21,x17			// h+=Ch(e,f,g)
244	eor	x17,x22,x22,ror#5
245	add	x21,x21,x16			// h+=Sigma1(e)
246	and	x28,x28,x19			// (b^c)&=(a^b)
247	add	x25,x25,x21			// d+=h
248	eor	x28,x28,x23			// Maj(a,b,c)
249	eor	x17,x12,x17,ror#34	// Sigma0(a)
250	add	x21,x21,x28			// h+=Maj(a,b,c)
251	ldr	x28,[x30],#8		// *K++, x19 in next round
252	//add	x21,x21,x17			// h+=Sigma0(a)
253#ifndef	__AARCH64EB__
254	rev	x10,x10			// 7
255#endif
256	ldp	x11,x12,[x1],#2*8
257	add	x21,x21,x17			// h+=Sigma0(a)
258	ror	x16,x25,#14
259	add	x20,x20,x28			// h+=K[i]
260	eor	x13,x25,x25,ror#23
261	and	x17,x26,x25
262	bic	x28,x27,x25
263	add	x20,x20,x10			// h+=X[i]
264	orr	x17,x17,x28			// Ch(e,f,g)
265	eor	x28,x21,x22			// a^b, b^c in next round
266	eor	x16,x16,x13,ror#18	// Sigma1(e)
267	ror	x13,x21,#28
268	add	x20,x20,x17			// h+=Ch(e,f,g)
269	eor	x17,x21,x21,ror#5
270	add	x20,x20,x16			// h+=Sigma1(e)
271	and	x19,x19,x28			// (b^c)&=(a^b)
272	add	x24,x24,x20			// d+=h
273	eor	x19,x19,x22			// Maj(a,b,c)
274	eor	x17,x13,x17,ror#34	// Sigma0(a)
275	add	x20,x20,x19			// h+=Maj(a,b,c)
276	ldr	x19,[x30],#8		// *K++, x28 in next round
277	//add	x20,x20,x17			// h+=Sigma0(a)
278#ifndef	__AARCH64EB__
279	rev	x11,x11			// 8
280#endif
281	add	x20,x20,x17			// h+=Sigma0(a)
282	ror	x16,x24,#14
283	add	x27,x27,x19			// h+=K[i]
284	eor	x14,x24,x24,ror#23
285	and	x17,x25,x24
286	bic	x19,x26,x24
287	add	x27,x27,x11			// h+=X[i]
288	orr	x17,x17,x19			// Ch(e,f,g)
289	eor	x19,x20,x21			// a^b, b^c in next round
290	eor	x16,x16,x14,ror#18	// Sigma1(e)
291	ror	x14,x20,#28
292	add	x27,x27,x17			// h+=Ch(e,f,g)
293	eor	x17,x20,x20,ror#5
294	add	x27,x27,x16			// h+=Sigma1(e)
295	and	x28,x28,x19			// (b^c)&=(a^b)
296	add	x23,x23,x27			// d+=h
297	eor	x28,x28,x21			// Maj(a,b,c)
298	eor	x17,x14,x17,ror#34	// Sigma0(a)
299	add	x27,x27,x28			// h+=Maj(a,b,c)
300	ldr	x28,[x30],#8		// *K++, x19 in next round
301	//add	x27,x27,x17			// h+=Sigma0(a)
302#ifndef	__AARCH64EB__
303	rev	x12,x12			// 9
304#endif
305	ldp	x13,x14,[x1],#2*8
306	add	x27,x27,x17			// h+=Sigma0(a)
307	ror	x16,x23,#14
308	add	x26,x26,x28			// h+=K[i]
309	eor	x15,x23,x23,ror#23
310	and	x17,x24,x23
311	bic	x28,x25,x23
312	add	x26,x26,x12			// h+=X[i]
313	orr	x17,x17,x28			// Ch(e,f,g)
314	eor	x28,x27,x20			// a^b, b^c in next round
315	eor	x16,x16,x15,ror#18	// Sigma1(e)
316	ror	x15,x27,#28
317	add	x26,x26,x17			// h+=Ch(e,f,g)
318	eor	x17,x27,x27,ror#5
319	add	x26,x26,x16			// h+=Sigma1(e)
320	and	x19,x19,x28			// (b^c)&=(a^b)
321	add	x22,x22,x26			// d+=h
322	eor	x19,x19,x20			// Maj(a,b,c)
323	eor	x17,x15,x17,ror#34	// Sigma0(a)
324	add	x26,x26,x19			// h+=Maj(a,b,c)
325	ldr	x19,[x30],#8		// *K++, x28 in next round
326	//add	x26,x26,x17			// h+=Sigma0(a)
327#ifndef	__AARCH64EB__
328	rev	x13,x13			// 10
329#endif
330	add	x26,x26,x17			// h+=Sigma0(a)
331	ror	x16,x22,#14
332	add	x25,x25,x19			// h+=K[i]
333	eor	x0,x22,x22,ror#23
334	and	x17,x23,x22
335	bic	x19,x24,x22
336	add	x25,x25,x13			// h+=X[i]
337	orr	x17,x17,x19			// Ch(e,f,g)
338	eor	x19,x26,x27			// a^b, b^c in next round
339	eor	x16,x16,x0,ror#18	// Sigma1(e)
340	ror	x0,x26,#28
341	add	x25,x25,x17			// h+=Ch(e,f,g)
342	eor	x17,x26,x26,ror#5
343	add	x25,x25,x16			// h+=Sigma1(e)
344	and	x28,x28,x19			// (b^c)&=(a^b)
345	add	x21,x21,x25			// d+=h
346	eor	x28,x28,x27			// Maj(a,b,c)
347	eor	x17,x0,x17,ror#34	// Sigma0(a)
348	add	x25,x25,x28			// h+=Maj(a,b,c)
349	ldr	x28,[x30],#8		// *K++, x19 in next round
350	//add	x25,x25,x17			// h+=Sigma0(a)
351#ifndef	__AARCH64EB__
352	rev	x14,x14			// 11
353#endif
354	ldp	x15,x0,[x1],#2*8
355	add	x25,x25,x17			// h+=Sigma0(a)
356	str	x6,[sp,#24]
357	ror	x16,x21,#14
358	add	x24,x24,x28			// h+=K[i]
359	eor	x6,x21,x21,ror#23
360	and	x17,x22,x21
361	bic	x28,x23,x21
362	add	x24,x24,x14			// h+=X[i]
363	orr	x17,x17,x28			// Ch(e,f,g)
364	eor	x28,x25,x26			// a^b, b^c in next round
365	eor	x16,x16,x6,ror#18	// Sigma1(e)
366	ror	x6,x25,#28
367	add	x24,x24,x17			// h+=Ch(e,f,g)
368	eor	x17,x25,x25,ror#5
369	add	x24,x24,x16			// h+=Sigma1(e)
370	and	x19,x19,x28			// (b^c)&=(a^b)
371	add	x20,x20,x24			// d+=h
372	eor	x19,x19,x26			// Maj(a,b,c)
373	eor	x17,x6,x17,ror#34	// Sigma0(a)
374	add	x24,x24,x19			// h+=Maj(a,b,c)
375	ldr	x19,[x30],#8		// *K++, x28 in next round
376	//add	x24,x24,x17			// h+=Sigma0(a)
377#ifndef	__AARCH64EB__
378	rev	x15,x15			// 12
379#endif
380	add	x24,x24,x17			// h+=Sigma0(a)
381	str	x7,[sp,#0]
382	ror	x16,x20,#14
383	add	x23,x23,x19			// h+=K[i]
384	eor	x7,x20,x20,ror#23
385	and	x17,x21,x20
386	bic	x19,x22,x20
387	add	x23,x23,x15			// h+=X[i]
388	orr	x17,x17,x19			// Ch(e,f,g)
389	eor	x19,x24,x25			// a^b, b^c in next round
390	eor	x16,x16,x7,ror#18	// Sigma1(e)
391	ror	x7,x24,#28
392	add	x23,x23,x17			// h+=Ch(e,f,g)
393	eor	x17,x24,x24,ror#5
394	add	x23,x23,x16			// h+=Sigma1(e)
395	and	x28,x28,x19			// (b^c)&=(a^b)
396	add	x27,x27,x23			// d+=h
397	eor	x28,x28,x25			// Maj(a,b,c)
398	eor	x17,x7,x17,ror#34	// Sigma0(a)
399	add	x23,x23,x28			// h+=Maj(a,b,c)
400	ldr	x28,[x30],#8		// *K++, x19 in next round
401	//add	x23,x23,x17			// h+=Sigma0(a)
402#ifndef	__AARCH64EB__
403	rev	x0,x0			// 13
404#endif
405	ldp	x1,x2,[x1]
406	add	x23,x23,x17			// h+=Sigma0(a)
407	str	x8,[sp,#8]
408	ror	x16,x27,#14
409	add	x22,x22,x28			// h+=K[i]
410	eor	x8,x27,x27,ror#23
411	and	x17,x20,x27
412	bic	x28,x21,x27
413	add	x22,x22,x0			// h+=X[i]
414	orr	x17,x17,x28			// Ch(e,f,g)
415	eor	x28,x23,x24			// a^b, b^c in next round
416	eor	x16,x16,x8,ror#18	// Sigma1(e)
417	ror	x8,x23,#28
418	add	x22,x22,x17			// h+=Ch(e,f,g)
419	eor	x17,x23,x23,ror#5
420	add	x22,x22,x16			// h+=Sigma1(e)
421	and	x19,x19,x28			// (b^c)&=(a^b)
422	add	x26,x26,x22			// d+=h
423	eor	x19,x19,x24			// Maj(a,b,c)
424	eor	x17,x8,x17,ror#34	// Sigma0(a)
425	add	x22,x22,x19			// h+=Maj(a,b,c)
426	ldr	x19,[x30],#8		// *K++, x28 in next round
427	//add	x22,x22,x17			// h+=Sigma0(a)
428#ifndef	__AARCH64EB__
429	rev	x1,x1			// 14
430#endif
431	ldr	x6,[sp,#24]
432	add	x22,x22,x17			// h+=Sigma0(a)
433	str	x9,[sp,#16]
434	ror	x16,x26,#14
435	add	x21,x21,x19			// h+=K[i]
436	eor	x9,x26,x26,ror#23
437	and	x17,x27,x26
438	bic	x19,x20,x26
439	add	x21,x21,x1			// h+=X[i]
440	orr	x17,x17,x19			// Ch(e,f,g)
441	eor	x19,x22,x23			// a^b, b^c in next round
442	eor	x16,x16,x9,ror#18	// Sigma1(e)
443	ror	x9,x22,#28
444	add	x21,x21,x17			// h+=Ch(e,f,g)
445	eor	x17,x22,x22,ror#5
446	add	x21,x21,x16			// h+=Sigma1(e)
447	and	x28,x28,x19			// (b^c)&=(a^b)
448	add	x25,x25,x21			// d+=h
449	eor	x28,x28,x23			// Maj(a,b,c)
450	eor	x17,x9,x17,ror#34	// Sigma0(a)
451	add	x21,x21,x28			// h+=Maj(a,b,c)
452	ldr	x28,[x30],#8		// *K++, x19 in next round
453	//add	x21,x21,x17			// h+=Sigma0(a)
454#ifndef	__AARCH64EB__
455	rev	x2,x2			// 15
456#endif
457	ldr	x7,[sp,#0]
458	add	x21,x21,x17			// h+=Sigma0(a)
459	str	x10,[sp,#24]
460	ror	x16,x25,#14
461	add	x20,x20,x28			// h+=K[i]
462	ror	x9,x4,#1
463	and	x17,x26,x25
464	ror	x8,x1,#19
465	bic	x28,x27,x25
466	ror	x10,x21,#28
467	add	x20,x20,x2			// h+=X[i]
468	eor	x16,x16,x25,ror#18
469	eor	x9,x9,x4,ror#8
470	orr	x17,x17,x28			// Ch(e,f,g)
471	eor	x28,x21,x22			// a^b, b^c in next round
472	eor	x16,x16,x25,ror#41	// Sigma1(e)
473	eor	x10,x10,x21,ror#34
474	add	x20,x20,x17			// h+=Ch(e,f,g)
475	and	x19,x19,x28			// (b^c)&=(a^b)
476	eor	x8,x8,x1,ror#61
477	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
478	add	x20,x20,x16			// h+=Sigma1(e)
479	eor	x19,x19,x22			// Maj(a,b,c)
480	eor	x17,x10,x21,ror#39	// Sigma0(a)
481	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
482	add	x3,x3,x12
483	add	x24,x24,x20			// d+=h
484	add	x20,x20,x19			// h+=Maj(a,b,c)
485	ldr	x19,[x30],#8		// *K++, x28 in next round
486	add	x3,x3,x9
487	add	x20,x20,x17			// h+=Sigma0(a)
488	add	x3,x3,x8
489.Loop_16_xx:
490	ldr	x8,[sp,#8]
491	str	x11,[sp,#0]
492	ror	x16,x24,#14
493	add	x27,x27,x19			// h+=K[i]
494	ror	x10,x5,#1
495	and	x17,x25,x24
496	ror	x9,x2,#19
497	bic	x19,x26,x24
498	ror	x11,x20,#28
499	add	x27,x27,x3			// h+=X[i]
500	eor	x16,x16,x24,ror#18
501	eor	x10,x10,x5,ror#8
502	orr	x17,x17,x19			// Ch(e,f,g)
503	eor	x19,x20,x21			// a^b, b^c in next round
504	eor	x16,x16,x24,ror#41	// Sigma1(e)
505	eor	x11,x11,x20,ror#34
506	add	x27,x27,x17			// h+=Ch(e,f,g)
507	and	x28,x28,x19			// (b^c)&=(a^b)
508	eor	x9,x9,x2,ror#61
509	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
510	add	x27,x27,x16			// h+=Sigma1(e)
511	eor	x28,x28,x21			// Maj(a,b,c)
512	eor	x17,x11,x20,ror#39	// Sigma0(a)
513	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
514	add	x4,x4,x13
515	add	x23,x23,x27			// d+=h
516	add	x27,x27,x28			// h+=Maj(a,b,c)
517	ldr	x28,[x30],#8		// *K++, x19 in next round
518	add	x4,x4,x10
519	add	x27,x27,x17			// h+=Sigma0(a)
520	add	x4,x4,x9
521	ldr	x9,[sp,#16]
522	str	x12,[sp,#8]
523	ror	x16,x23,#14
524	add	x26,x26,x28			// h+=K[i]
525	ror	x11,x6,#1
526	and	x17,x24,x23
527	ror	x10,x3,#19
528	bic	x28,x25,x23
529	ror	x12,x27,#28
530	add	x26,x26,x4			// h+=X[i]
531	eor	x16,x16,x23,ror#18
532	eor	x11,x11,x6,ror#8
533	orr	x17,x17,x28			// Ch(e,f,g)
534	eor	x28,x27,x20			// a^b, b^c in next round
535	eor	x16,x16,x23,ror#41	// Sigma1(e)
536	eor	x12,x12,x27,ror#34
537	add	x26,x26,x17			// h+=Ch(e,f,g)
538	and	x19,x19,x28			// (b^c)&=(a^b)
539	eor	x10,x10,x3,ror#61
540	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
541	add	x26,x26,x16			// h+=Sigma1(e)
542	eor	x19,x19,x20			// Maj(a,b,c)
543	eor	x17,x12,x27,ror#39	// Sigma0(a)
544	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
545	add	x5,x5,x14
546	add	x22,x22,x26			// d+=h
547	add	x26,x26,x19			// h+=Maj(a,b,c)
548	ldr	x19,[x30],#8		// *K++, x28 in next round
549	add	x5,x5,x11
550	add	x26,x26,x17			// h+=Sigma0(a)
551	add	x5,x5,x10
552	ldr	x10,[sp,#24]
553	str	x13,[sp,#16]
554	ror	x16,x22,#14
555	add	x25,x25,x19			// h+=K[i]
556	ror	x12,x7,#1
557	and	x17,x23,x22
558	ror	x11,x4,#19
559	bic	x19,x24,x22
560	ror	x13,x26,#28
561	add	x25,x25,x5			// h+=X[i]
562	eor	x16,x16,x22,ror#18
563	eor	x12,x12,x7,ror#8
564	orr	x17,x17,x19			// Ch(e,f,g)
565	eor	x19,x26,x27			// a^b, b^c in next round
566	eor	x16,x16,x22,ror#41	// Sigma1(e)
567	eor	x13,x13,x26,ror#34
568	add	x25,x25,x17			// h+=Ch(e,f,g)
569	and	x28,x28,x19			// (b^c)&=(a^b)
570	eor	x11,x11,x4,ror#61
571	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
572	add	x25,x25,x16			// h+=Sigma1(e)
573	eor	x28,x28,x27			// Maj(a,b,c)
574	eor	x17,x13,x26,ror#39	// Sigma0(a)
575	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
576	add	x6,x6,x15
577	add	x21,x21,x25			// d+=h
578	add	x25,x25,x28			// h+=Maj(a,b,c)
579	ldr	x28,[x30],#8		// *K++, x19 in next round
580	add	x6,x6,x12
581	add	x25,x25,x17			// h+=Sigma0(a)
582	add	x6,x6,x11
583	ldr	x11,[sp,#0]
584	str	x14,[sp,#24]
585	ror	x16,x21,#14
586	add	x24,x24,x28			// h+=K[i]
587	ror	x13,x8,#1
588	and	x17,x22,x21
589	ror	x12,x5,#19
590	bic	x28,x23,x21
591	ror	x14,x25,#28
592	add	x24,x24,x6			// h+=X[i]
593	eor	x16,x16,x21,ror#18
594	eor	x13,x13,x8,ror#8
595	orr	x17,x17,x28			// Ch(e,f,g)
596	eor	x28,x25,x26			// a^b, b^c in next round
597	eor	x16,x16,x21,ror#41	// Sigma1(e)
598	eor	x14,x14,x25,ror#34
599	add	x24,x24,x17			// h+=Ch(e,f,g)
600	and	x19,x19,x28			// (b^c)&=(a^b)
601	eor	x12,x12,x5,ror#61
602	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
603	add	x24,x24,x16			// h+=Sigma1(e)
604	eor	x19,x19,x26			// Maj(a,b,c)
605	eor	x17,x14,x25,ror#39	// Sigma0(a)
606	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
607	add	x7,x7,x0
608	add	x20,x20,x24			// d+=h
609	add	x24,x24,x19			// h+=Maj(a,b,c)
610	ldr	x19,[x30],#8		// *K++, x28 in next round
611	add	x7,x7,x13
612	add	x24,x24,x17			// h+=Sigma0(a)
613	add	x7,x7,x12
614	ldr	x12,[sp,#8]
615	str	x15,[sp,#0]
616	ror	x16,x20,#14
617	add	x23,x23,x19			// h+=K[i]
618	ror	x14,x9,#1
619	and	x17,x21,x20
620	ror	x13,x6,#19
621	bic	x19,x22,x20
622	ror	x15,x24,#28
623	add	x23,x23,x7			// h+=X[i]
624	eor	x16,x16,x20,ror#18
625	eor	x14,x14,x9,ror#8
626	orr	x17,x17,x19			// Ch(e,f,g)
627	eor	x19,x24,x25			// a^b, b^c in next round
628	eor	x16,x16,x20,ror#41	// Sigma1(e)
629	eor	x15,x15,x24,ror#34
630	add	x23,x23,x17			// h+=Ch(e,f,g)
631	and	x28,x28,x19			// (b^c)&=(a^b)
632	eor	x13,x13,x6,ror#61
633	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
634	add	x23,x23,x16			// h+=Sigma1(e)
635	eor	x28,x28,x25			// Maj(a,b,c)
636	eor	x17,x15,x24,ror#39	// Sigma0(a)
637	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
638	add	x8,x8,x1
639	add	x27,x27,x23			// d+=h
640	add	x23,x23,x28			// h+=Maj(a,b,c)
641	ldr	x28,[x30],#8		// *K++, x19 in next round
642	add	x8,x8,x14
643	add	x23,x23,x17			// h+=Sigma0(a)
644	add	x8,x8,x13
645	ldr	x13,[sp,#16]
646	str	x0,[sp,#8]
647	ror	x16,x27,#14
648	add	x22,x22,x28			// h+=K[i]
649	ror	x15,x10,#1
650	and	x17,x20,x27
651	ror	x14,x7,#19
652	bic	x28,x21,x27
653	ror	x0,x23,#28
654	add	x22,x22,x8			// h+=X[i]
655	eor	x16,x16,x27,ror#18
656	eor	x15,x15,x10,ror#8
657	orr	x17,x17,x28			// Ch(e,f,g)
658	eor	x28,x23,x24			// a^b, b^c in next round
659	eor	x16,x16,x27,ror#41	// Sigma1(e)
660	eor	x0,x0,x23,ror#34
661	add	x22,x22,x17			// h+=Ch(e,f,g)
662	and	x19,x19,x28			// (b^c)&=(a^b)
663	eor	x14,x14,x7,ror#61
664	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
665	add	x22,x22,x16			// h+=Sigma1(e)
666	eor	x19,x19,x24			// Maj(a,b,c)
667	eor	x17,x0,x23,ror#39	// Sigma0(a)
668	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
669	add	x9,x9,x2
670	add	x26,x26,x22			// d+=h
671	add	x22,x22,x19			// h+=Maj(a,b,c)
672	ldr	x19,[x30],#8		// *K++, x28 in next round
673	add	x9,x9,x15
674	add	x22,x22,x17			// h+=Sigma0(a)
675	add	x9,x9,x14
676	ldr	x14,[sp,#24]
677	str	x1,[sp,#16]
678	ror	x16,x26,#14
679	add	x21,x21,x19			// h+=K[i]
680	ror	x0,x11,#1
681	and	x17,x27,x26
682	ror	x15,x8,#19
683	bic	x19,x20,x26
684	ror	x1,x22,#28
685	add	x21,x21,x9			// h+=X[i]
686	eor	x16,x16,x26,ror#18
687	eor	x0,x0,x11,ror#8
688	orr	x17,x17,x19			// Ch(e,f,g)
689	eor	x19,x22,x23			// a^b, b^c in next round
690	eor	x16,x16,x26,ror#41	// Sigma1(e)
691	eor	x1,x1,x22,ror#34
692	add	x21,x21,x17			// h+=Ch(e,f,g)
693	and	x28,x28,x19			// (b^c)&=(a^b)
694	eor	x15,x15,x8,ror#61
695	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
696	add	x21,x21,x16			// h+=Sigma1(e)
697	eor	x28,x28,x23			// Maj(a,b,c)
698	eor	x17,x1,x22,ror#39	// Sigma0(a)
699	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
700	add	x10,x10,x3
701	add	x25,x25,x21			// d+=h
702	add	x21,x21,x28			// h+=Maj(a,b,c)
703	ldr	x28,[x30],#8		// *K++, x19 in next round
704	add	x10,x10,x0
705	add	x21,x21,x17			// h+=Sigma0(a)
706	add	x10,x10,x15
707	ldr	x15,[sp,#0]
708	str	x2,[sp,#24]
709	ror	x16,x25,#14
710	add	x20,x20,x28			// h+=K[i]
711	ror	x1,x12,#1
712	and	x17,x26,x25
713	ror	x0,x9,#19
714	bic	x28,x27,x25
715	ror	x2,x21,#28
716	add	x20,x20,x10			// h+=X[i]
717	eor	x16,x16,x25,ror#18
718	eor	x1,x1,x12,ror#8
719	orr	x17,x17,x28			// Ch(e,f,g)
720	eor	x28,x21,x22			// a^b, b^c in next round
721	eor	x16,x16,x25,ror#41	// Sigma1(e)
722	eor	x2,x2,x21,ror#34
723	add	x20,x20,x17			// h+=Ch(e,f,g)
724	and	x19,x19,x28			// (b^c)&=(a^b)
725	eor	x0,x0,x9,ror#61
726	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
727	add	x20,x20,x16			// h+=Sigma1(e)
728	eor	x19,x19,x22			// Maj(a,b,c)
729	eor	x17,x2,x21,ror#39	// Sigma0(a)
730	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
731	add	x11,x11,x4
732	add	x24,x24,x20			// d+=h
733	add	x20,x20,x19			// h+=Maj(a,b,c)
734	ldr	x19,[x30],#8		// *K++, x28 in next round
735	add	x11,x11,x1
736	add	x20,x20,x17			// h+=Sigma0(a)
737	add	x11,x11,x0
738	ldr	x0,[sp,#8]
739	str	x3,[sp,#0]
740	ror	x16,x24,#14
741	add	x27,x27,x19			// h+=K[i]
742	ror	x2,x13,#1
743	and	x17,x25,x24
744	ror	x1,x10,#19
745	bic	x19,x26,x24
746	ror	x3,x20,#28
747	add	x27,x27,x11			// h+=X[i]
748	eor	x16,x16,x24,ror#18
749	eor	x2,x2,x13,ror#8
750	orr	x17,x17,x19			// Ch(e,f,g)
751	eor	x19,x20,x21			// a^b, b^c in next round
752	eor	x16,x16,x24,ror#41	// Sigma1(e)
753	eor	x3,x3,x20,ror#34
754	add	x27,x27,x17			// h+=Ch(e,f,g)
755	and	x28,x28,x19			// (b^c)&=(a^b)
756	eor	x1,x1,x10,ror#61
757	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
758	add	x27,x27,x16			// h+=Sigma1(e)
759	eor	x28,x28,x21			// Maj(a,b,c)
760	eor	x17,x3,x20,ror#39	// Sigma0(a)
761	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
762	add	x12,x12,x5
763	add	x23,x23,x27			// d+=h
764	add	x27,x27,x28			// h+=Maj(a,b,c)
765	ldr	x28,[x30],#8		// *K++, x19 in next round
766	add	x12,x12,x2
767	add	x27,x27,x17			// h+=Sigma0(a)
768	add	x12,x12,x1
769	ldr	x1,[sp,#16]
770	str	x4,[sp,#8]
771	ror	x16,x23,#14
772	add	x26,x26,x28			// h+=K[i]
773	ror	x3,x14,#1
774	and	x17,x24,x23
775	ror	x2,x11,#19
776	bic	x28,x25,x23
777	ror	x4,x27,#28
778	add	x26,x26,x12			// h+=X[i]
779	eor	x16,x16,x23,ror#18
780	eor	x3,x3,x14,ror#8
781	orr	x17,x17,x28			// Ch(e,f,g)
782	eor	x28,x27,x20			// a^b, b^c in next round
783	eor	x16,x16,x23,ror#41	// Sigma1(e)
784	eor	x4,x4,x27,ror#34
785	add	x26,x26,x17			// h+=Ch(e,f,g)
786	and	x19,x19,x28			// (b^c)&=(a^b)
787	eor	x2,x2,x11,ror#61
788	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
789	add	x26,x26,x16			// h+=Sigma1(e)
790	eor	x19,x19,x20			// Maj(a,b,c)
791	eor	x17,x4,x27,ror#39	// Sigma0(a)
792	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
793	add	x13,x13,x6
794	add	x22,x22,x26			// d+=h
795	add	x26,x26,x19			// h+=Maj(a,b,c)
796	ldr	x19,[x30],#8		// *K++, x28 in next round
797	add	x13,x13,x3
798	add	x26,x26,x17			// h+=Sigma0(a)
799	add	x13,x13,x2
800	ldr	x2,[sp,#24]
801	str	x5,[sp,#16]
802	ror	x16,x22,#14
803	add	x25,x25,x19			// h+=K[i]
804	ror	x4,x15,#1
805	and	x17,x23,x22
806	ror	x3,x12,#19
807	bic	x19,x24,x22
808	ror	x5,x26,#28
809	add	x25,x25,x13			// h+=X[i]
810	eor	x16,x16,x22,ror#18
811	eor	x4,x4,x15,ror#8
812	orr	x17,x17,x19			// Ch(e,f,g)
813	eor	x19,x26,x27			// a^b, b^c in next round
814	eor	x16,x16,x22,ror#41	// Sigma1(e)
815	eor	x5,x5,x26,ror#34
816	add	x25,x25,x17			// h+=Ch(e,f,g)
817	and	x28,x28,x19			// (b^c)&=(a^b)
818	eor	x3,x3,x12,ror#61
819	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
820	add	x25,x25,x16			// h+=Sigma1(e)
821	eor	x28,x28,x27			// Maj(a,b,c)
822	eor	x17,x5,x26,ror#39	// Sigma0(a)
823	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
824	add	x14,x14,x7
825	add	x21,x21,x25			// d+=h
826	add	x25,x25,x28			// h+=Maj(a,b,c)
827	ldr	x28,[x30],#8		// *K++, x19 in next round
828	add	x14,x14,x4
829	add	x25,x25,x17			// h+=Sigma0(a)
830	add	x14,x14,x3
831	ldr	x3,[sp,#0]
832	str	x6,[sp,#24]
833	ror	x16,x21,#14
834	add	x24,x24,x28			// h+=K[i]
835	ror	x5,x0,#1
836	and	x17,x22,x21
837	ror	x4,x13,#19
838	bic	x28,x23,x21
839	ror	x6,x25,#28
840	add	x24,x24,x14			// h+=X[i]
841	eor	x16,x16,x21,ror#18
842	eor	x5,x5,x0,ror#8
843	orr	x17,x17,x28			// Ch(e,f,g)
844	eor	x28,x25,x26			// a^b, b^c in next round
845	eor	x16,x16,x21,ror#41	// Sigma1(e)
846	eor	x6,x6,x25,ror#34
847	add	x24,x24,x17			// h+=Ch(e,f,g)
848	and	x19,x19,x28			// (b^c)&=(a^b)
849	eor	x4,x4,x13,ror#61
850	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
851	add	x24,x24,x16			// h+=Sigma1(e)
852	eor	x19,x19,x26			// Maj(a,b,c)
853	eor	x17,x6,x25,ror#39	// Sigma0(a)
854	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
855	add	x15,x15,x8
856	add	x20,x20,x24			// d+=h
857	add	x24,x24,x19			// h+=Maj(a,b,c)
858	ldr	x19,[x30],#8		// *K++, x28 in next round
859	add	x15,x15,x5
860	add	x24,x24,x17			// h+=Sigma0(a)
861	add	x15,x15,x4
862	ldr	x4,[sp,#8]
863	str	x7,[sp,#0]
864	ror	x16,x20,#14
865	add	x23,x23,x19			// h+=K[i]
866	ror	x6,x1,#1
867	and	x17,x21,x20
868	ror	x5,x14,#19
869	bic	x19,x22,x20
870	ror	x7,x24,#28
871	add	x23,x23,x15			// h+=X[i]
872	eor	x16,x16,x20,ror#18
873	eor	x6,x6,x1,ror#8
874	orr	x17,x17,x19			// Ch(e,f,g)
875	eor	x19,x24,x25			// a^b, b^c in next round
876	eor	x16,x16,x20,ror#41	// Sigma1(e)
877	eor	x7,x7,x24,ror#34
878	add	x23,x23,x17			// h+=Ch(e,f,g)
879	and	x28,x28,x19			// (b^c)&=(a^b)
880	eor	x5,x5,x14,ror#61
881	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
882	add	x23,x23,x16			// h+=Sigma1(e)
883	eor	x28,x28,x25			// Maj(a,b,c)
884	eor	x17,x7,x24,ror#39	// Sigma0(a)
885	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
886	add	x0,x0,x9
887	add	x27,x27,x23			// d+=h
888	add	x23,x23,x28			// h+=Maj(a,b,c)
889	ldr	x28,[x30],#8		// *K++, x19 in next round
890	add	x0,x0,x6
891	add	x23,x23,x17			// h+=Sigma0(a)
892	add	x0,x0,x5
893	ldr	x5,[sp,#16]
894	str	x8,[sp,#8]
895	ror	x16,x27,#14
896	add	x22,x22,x28			// h+=K[i]
897	ror	x7,x2,#1
898	and	x17,x20,x27
899	ror	x6,x15,#19
900	bic	x28,x21,x27
901	ror	x8,x23,#28
902	add	x22,x22,x0			// h+=X[i]
903	eor	x16,x16,x27,ror#18
904	eor	x7,x7,x2,ror#8
905	orr	x17,x17,x28			// Ch(e,f,g)
906	eor	x28,x23,x24			// a^b, b^c in next round
907	eor	x16,x16,x27,ror#41	// Sigma1(e)
908	eor	x8,x8,x23,ror#34
909	add	x22,x22,x17			// h+=Ch(e,f,g)
910	and	x19,x19,x28			// (b^c)&=(a^b)
911	eor	x6,x6,x15,ror#61
912	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
913	add	x22,x22,x16			// h+=Sigma1(e)
914	eor	x19,x19,x24			// Maj(a,b,c)
915	eor	x17,x8,x23,ror#39	// Sigma0(a)
916	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
917	add	x1,x1,x10
918	add	x26,x26,x22			// d+=h
919	add	x22,x22,x19			// h+=Maj(a,b,c)
920	ldr	x19,[x30],#8		// *K++, x28 in next round
921	add	x1,x1,x7
922	add	x22,x22,x17			// h+=Sigma0(a)
923	add	x1,x1,x6
924	ldr	x6,[sp,#24]
925	str	x9,[sp,#16]
926	ror	x16,x26,#14
927	add	x21,x21,x19			// h+=K[i]
928	ror	x8,x3,#1
929	and	x17,x27,x26
930	ror	x7,x0,#19
931	bic	x19,x20,x26
932	ror	x9,x22,#28
933	add	x21,x21,x1			// h+=X[i]
934	eor	x16,x16,x26,ror#18
935	eor	x8,x8,x3,ror#8
936	orr	x17,x17,x19			// Ch(e,f,g)
937	eor	x19,x22,x23			// a^b, b^c in next round
938	eor	x16,x16,x26,ror#41	// Sigma1(e)
939	eor	x9,x9,x22,ror#34
940	add	x21,x21,x17			// h+=Ch(e,f,g)
941	and	x28,x28,x19			// (b^c)&=(a^b)
942	eor	x7,x7,x0,ror#61
943	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
944	add	x21,x21,x16			// h+=Sigma1(e)
945	eor	x28,x28,x23			// Maj(a,b,c)
946	eor	x17,x9,x22,ror#39	// Sigma0(a)
947	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
948	add	x2,x2,x11
949	add	x25,x25,x21			// d+=h
950	add	x21,x21,x28			// h+=Maj(a,b,c)
951	ldr	x28,[x30],#8		// *K++, x19 in next round
952	add	x2,x2,x8
953	add	x21,x21,x17			// h+=Sigma0(a)
954	add	x2,x2,x7
955	ldr	x7,[sp,#0]
956	str	x10,[sp,#24]
957	ror	x16,x25,#14
958	add	x20,x20,x28			// h+=K[i]
959	ror	x9,x4,#1
960	and	x17,x26,x25
961	ror	x8,x1,#19
962	bic	x28,x27,x25
963	ror	x10,x21,#28
964	add	x20,x20,x2			// h+=X[i]
965	eor	x16,x16,x25,ror#18
966	eor	x9,x9,x4,ror#8
967	orr	x17,x17,x28			// Ch(e,f,g)
968	eor	x28,x21,x22			// a^b, b^c in next round
969	eor	x16,x16,x25,ror#41	// Sigma1(e)
970	eor	x10,x10,x21,ror#34
971	add	x20,x20,x17			// h+=Ch(e,f,g)
972	and	x19,x19,x28			// (b^c)&=(a^b)
973	eor	x8,x8,x1,ror#61
974	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
975	add	x20,x20,x16			// h+=Sigma1(e)
976	eor	x19,x19,x22			// Maj(a,b,c)
977	eor	x17,x10,x21,ror#39	// Sigma0(a)
978	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
979	add	x3,x3,x12
980	add	x24,x24,x20			// d+=h
981	add	x20,x20,x19			// h+=Maj(a,b,c)
982	ldr	x19,[x30],#8		// *K++, x28 in next round
983	add	x3,x3,x9
984	add	x20,x20,x17			// h+=Sigma0(a)
985	add	x3,x3,x8
986	cbnz	x19,.Loop_16_xx
987
988	ldp	x0,x2,[x29,#96]
989	ldr	x1,[x29,#112]
990	sub	x30,x30,#648		// rewind
991
992	ldp	x3,x4,[x0]
993	ldp	x5,x6,[x0,#2*8]
994	add	x1,x1,#14*8			// advance input pointer
995	ldp	x7,x8,[x0,#4*8]
996	add	x20,x20,x3
997	ldp	x9,x10,[x0,#6*8]
998	add	x21,x21,x4
999	add	x22,x22,x5
1000	add	x23,x23,x6
1001	stp	x20,x21,[x0]
1002	add	x24,x24,x7
1003	add	x25,x25,x8
1004	stp	x22,x23,[x0,#2*8]
1005	add	x26,x26,x9
1006	add	x27,x27,x10
1007	cmp	x1,x2
1008	stp	x24,x25,[x0,#4*8]
1009	stp	x26,x27,[x0,#6*8]
1010	b.ne	.Loop
1011
1012	ldp	x19,x20,[x29,#16]
1013	add	sp,sp,#4*8
1014	ldp	x21,x22,[x29,#32]
1015	ldp	x23,x24,[x29,#48]
1016	ldp	x25,x26,[x29,#64]
1017	ldp	x27,x28,[x29,#80]
1018	ldp	x29,x30,[sp],#128
1019	AARCH64_VALIDATE_LINK_REGISTER
1020	ret
1021.size	sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
1022
// .LK512: the 80 64-bit SHA-512 round constants K[0..79], two per .quad
// line, followed by one zero quadword that marks the end of the table.
// The table lives in .rodata and is aligned to 2^6 = 64 bytes.
// NOTE(review): the scalar code earlier in this file reads the constants
// 8 bytes at a time ("*K++") and branches while the loaded value is non-zero,
// so it appears to depend on the zero terminator; the hardware routine
// further down rewinds its table pointer by a fixed 80*8 bytes instead.
1023.section	.rodata
1024.align	6
1025.type	.LK512,%object
1026.LK512:
1027.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1028.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1029.quad	0x3956c25bf348b538,0x59f111f1b605d019
1030.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1031.quad	0xd807aa98a3030242,0x12835b0145706fbe
1032.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1033.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1034.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1035.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1036.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1037.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1038.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1039.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1040.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1041.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1042.quad	0x06ca6351e003826f,0x142929670a0e6e70
1043.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1044.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1045.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1046.quad	0x81c2c92e47edaee6,0x92722c851482353b
1047.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1048.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1049.quad	0xd192e819d6ef5218,0xd69906245565a910
1050.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1051.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1052.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1053.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1054.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1055.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1056.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1057.quad	0x90befffa23631e28,0xa4506cebde82bde9
1058.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1059.quad	0xca273eceea26619c,0xd186b8c721c0c207
1060.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1061.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1062.quad	0x113f9804bef90dae,0x1b710b35131c471b
1063.quad	0x28db77f523047d84,0x32caab7b40c72493
1064.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1065.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1066.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1067.quad	0	// terminator
1068.size	.LK512,.-.LK512
// NUL-terminated ASCII identification string:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
// It comes after the .size directive, so it is not counted in .LK512's size.
1069.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to a 2^2 = 4-byte boundary; the second directive repeats the
// first and has no further effect.
1070.align	2
1071.align	2
// Switch back to the code section for the routine that follows.
1072.text
// sha512_block_data_order_hw: SHA-512 block function built on the Armv8
// SHA-512 instructions (emitted as raw .inst words in the loop that follows).
// It is assembled only when __KERNEL__ is not defined.
//
// Register use, as established by the code of this routine:
//   x0      - pointer to the 64-byte hash state (loaded here, stored on exit)
//   x1      - input pointer; the first 128-byte block is loaded here
//   x2      - loop counter, decremented once per 128-byte block
//   x3      - pointer into the .LK512 constant table
//   v0-v3   - hash state, two 64-bit words per register
//   v16-v23 - current 128-byte message block
// NOTE(review): the C prototype is not visible in this file; the argument
// meanings above are inferred from how x0-x2 are used. The first block is
// loaded unconditionally and x2 is not checked for zero before the loop, so
// callers presumably must pass at least one block - confirm.
1073#ifndef	__KERNEL__
1074.globl	sha512_block_data_order_hw
1075.hidden	sha512_block_data_order_hw
1076.type	sha512_block_data_order_hw,%function
1077.align	6
1078sha512_block_data_order_hw:
1079	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
// AARCH64_VALID_CALL_TARGET is a macro defined outside this file
// (NOTE(review): presumably an indirect-branch landing pad from
// openssl/asm_base.h - confirm).
1080	AARCH64_VALID_CALL_TARGET
// Push a 16-byte frame record (x29, x30) and make x29 the frame pointer.
1081	stp	x29,x30,[sp,#-16]!
1082	add	x29,sp,#0
1083
// Load the first 128-byte block into v16-v23; x1 ends up 128 bytes further on.
1084	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1085	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1086
1087	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
// x3 = address of .LK512 (4 KiB page base from adrp plus the low 12 bits).
1088	adrp	x3,.LK512
1089	add	x3,x3,:lo12:.LK512
1090
// Reverse the byte order inside every 64-bit lane of the message block.
1091	rev64	v16.16b,v16.16b
1092	rev64	v17.16b,v17.16b
1093	rev64	v18.16b,v18.16b
1094	rev64	v19.16b,v19.16b
1095	rev64	v20.16b,v20.16b
1096	rev64	v21.16b,v21.16b
1097	rev64	v22.16b,v22.16b
1098	rev64	v23.16b,v23.16b
// Branch over whatever padding the alignment directive before the loop
// label may insert.
1099	b	.Loop_hw
1100
// .Loop_hw: main loop of sha512_block_data_order_hw, one iteration per
// 128-byte block.
//
// At the top of each iteration v0-v3 hold the hash state, v16-v23 hold the
// byte-swapped message block, x1 points just past that block, x2 is the
// remaining loop count and x3 points at the start of .LK512.
//
// The body is 40 unrolled groups. Each group adds one 16-byte pair of table
// constants (alternately held in v24 and v25) to one message register, swaps
// the two 64-bit halves of the sum with ext #8, and applies sha512h and
// sha512h2. A group therefore consumes two of the 80 constants. v0-v4 rotate
// roles from group to group and v5-v7 are scratch. The first 32 groups also
// advance the message schedule with sha512su0 / sha512su1; the last 8 groups
// load and byte-swap the next 128 bytes of input instead.
//
// The SHA-512 instructions are emitted as raw .inst words; the intended
// mnemonic is given in the comment on each such line.
1101.align	4
1102.Loop_hw:
1103	ld1	{v24.2d},[x3],#16
1104	subs	x2,x2,#1
// x4 = start of the block being processed. When x2 has just reached zero
// (last block) the csel below moves x1 back to it, so the "load next input"
// instructions re-read this block rather than the memory after it.
// NOTE(review): presumably this avoids reading beyond the caller's buffer.
1105	sub	x4,x1,#128
// Keep a copy of the incoming state in v26-v29 (orr of a register with
// itself is a plain move); it is added back after the last group.
1106	orr	v26.16b,v0.16b,v0.16b			// offload
1107	orr	v27.16b,v1.16b,v1.16b
1108	orr	v28.16b,v2.16b,v2.16b
1109	orr	v29.16b,v3.16b,v3.16b
1110	csel	x1,x1,x4,ne			// conditional rewind
// Groups 1-32: compression step plus message-schedule update.
1111	add	v24.2d,v24.2d,v16.2d
1112	ld1	{v25.2d},[x3],#16
1113	ext	v24.16b,v24.16b,v24.16b,#8
1114	ext	v5.16b,v2.16b,v3.16b,#8
1115	ext	v6.16b,v1.16b,v2.16b,#8
1116	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1117.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1118	ext	v7.16b,v20.16b,v21.16b,#8
1119.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1120.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1121	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1122.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1123	add	v25.2d,v25.2d,v17.2d
1124	ld1	{v24.2d},[x3],#16
1125	ext	v25.16b,v25.16b,v25.16b,#8
1126	ext	v5.16b,v4.16b,v2.16b,#8
1127	ext	v6.16b,v0.16b,v4.16b,#8
1128	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1129.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1130	ext	v7.16b,v21.16b,v22.16b,#8
1131.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1132.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1133	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1134.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1135	add	v24.2d,v24.2d,v18.2d
1136	ld1	{v25.2d},[x3],#16
1137	ext	v24.16b,v24.16b,v24.16b,#8
1138	ext	v5.16b,v1.16b,v4.16b,#8
1139	ext	v6.16b,v3.16b,v1.16b,#8
1140	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1141.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1142	ext	v7.16b,v22.16b,v23.16b,#8
1143.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1144.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1145	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1146.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1147	add	v25.2d,v25.2d,v19.2d
1148	ld1	{v24.2d},[x3],#16
1149	ext	v25.16b,v25.16b,v25.16b,#8
1150	ext	v5.16b,v0.16b,v1.16b,#8
1151	ext	v6.16b,v2.16b,v0.16b,#8
1152	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1153.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1154	ext	v7.16b,v23.16b,v16.16b,#8
1155.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1156.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1157	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1158.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1159	add	v24.2d,v24.2d,v20.2d
1160	ld1	{v25.2d},[x3],#16
1161	ext	v24.16b,v24.16b,v24.16b,#8
1162	ext	v5.16b,v3.16b,v0.16b,#8
1163	ext	v6.16b,v4.16b,v3.16b,#8
1164	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1165.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1166	ext	v7.16b,v16.16b,v17.16b,#8
1167.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1168.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1169	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1170.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1171	add	v25.2d,v25.2d,v21.2d
1172	ld1	{v24.2d},[x3],#16
1173	ext	v25.16b,v25.16b,v25.16b,#8
1174	ext	v5.16b,v2.16b,v3.16b,#8
1175	ext	v6.16b,v1.16b,v2.16b,#8
1176	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1177.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1178	ext	v7.16b,v17.16b,v18.16b,#8
1179.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1180.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1181	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1182.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1183	add	v24.2d,v24.2d,v22.2d
1184	ld1	{v25.2d},[x3],#16
1185	ext	v24.16b,v24.16b,v24.16b,#8
1186	ext	v5.16b,v4.16b,v2.16b,#8
1187	ext	v6.16b,v0.16b,v4.16b,#8
1188	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1189.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1190	ext	v7.16b,v18.16b,v19.16b,#8
1191.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1192.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1193	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1194.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1195	add	v25.2d,v25.2d,v23.2d
1196	ld1	{v24.2d},[x3],#16
1197	ext	v25.16b,v25.16b,v25.16b,#8
1198	ext	v5.16b,v1.16b,v4.16b,#8
1199	ext	v6.16b,v3.16b,v1.16b,#8
1200	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1201.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1202	ext	v7.16b,v19.16b,v20.16b,#8
1203.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1204.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1205	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1206.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1207	add	v24.2d,v24.2d,v16.2d
1208	ld1	{v25.2d},[x3],#16
1209	ext	v24.16b,v24.16b,v24.16b,#8
1210	ext	v5.16b,v0.16b,v1.16b,#8
1211	ext	v6.16b,v2.16b,v0.16b,#8
1212	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1213.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1214	ext	v7.16b,v20.16b,v21.16b,#8
1215.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1216.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1217	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1218.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1219	add	v25.2d,v25.2d,v17.2d
1220	ld1	{v24.2d},[x3],#16
1221	ext	v25.16b,v25.16b,v25.16b,#8
1222	ext	v5.16b,v3.16b,v0.16b,#8
1223	ext	v6.16b,v4.16b,v3.16b,#8
1224	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1225.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1226	ext	v7.16b,v21.16b,v22.16b,#8
1227.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1228.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1229	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1230.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1231	add	v24.2d,v24.2d,v18.2d
1232	ld1	{v25.2d},[x3],#16
1233	ext	v24.16b,v24.16b,v24.16b,#8
1234	ext	v5.16b,v2.16b,v3.16b,#8
1235	ext	v6.16b,v1.16b,v2.16b,#8
1236	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1237.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1238	ext	v7.16b,v22.16b,v23.16b,#8
1239.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1240.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1241	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1242.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1243	add	v25.2d,v25.2d,v19.2d
1244	ld1	{v24.2d},[x3],#16
1245	ext	v25.16b,v25.16b,v25.16b,#8
1246	ext	v5.16b,v4.16b,v2.16b,#8
1247	ext	v6.16b,v0.16b,v4.16b,#8
1248	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1249.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1250	ext	v7.16b,v23.16b,v16.16b,#8
1251.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1252.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1253	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1254.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1255	add	v24.2d,v24.2d,v20.2d
1256	ld1	{v25.2d},[x3],#16
1257	ext	v24.16b,v24.16b,v24.16b,#8
1258	ext	v5.16b,v1.16b,v4.16b,#8
1259	ext	v6.16b,v3.16b,v1.16b,#8
1260	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1261.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1262	ext	v7.16b,v16.16b,v17.16b,#8
1263.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1264.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1265	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1266.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1267	add	v25.2d,v25.2d,v21.2d
1268	ld1	{v24.2d},[x3],#16
1269	ext	v25.16b,v25.16b,v25.16b,#8
1270	ext	v5.16b,v0.16b,v1.16b,#8
1271	ext	v6.16b,v2.16b,v0.16b,#8
1272	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1273.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1274	ext	v7.16b,v17.16b,v18.16b,#8
1275.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1276.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1277	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1278.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1279	add	v24.2d,v24.2d,v22.2d
1280	ld1	{v25.2d},[x3],#16
1281	ext	v24.16b,v24.16b,v24.16b,#8
1282	ext	v5.16b,v3.16b,v0.16b,#8
1283	ext	v6.16b,v4.16b,v3.16b,#8
1284	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1285.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1286	ext	v7.16b,v18.16b,v19.16b,#8
1287.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1288.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1289	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1290.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1291	add	v25.2d,v25.2d,v23.2d
1292	ld1	{v24.2d},[x3],#16
1293	ext	v25.16b,v25.16b,v25.16b,#8
1294	ext	v5.16b,v2.16b,v3.16b,#8
1295	ext	v6.16b,v1.16b,v2.16b,#8
1296	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1297.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1298	ext	v7.16b,v19.16b,v20.16b,#8
1299.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1300.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1301	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1302.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1303	add	v24.2d,v24.2d,v16.2d
1304	ld1	{v25.2d},[x3],#16
1305	ext	v24.16b,v24.16b,v24.16b,#8
1306	ext	v5.16b,v4.16b,v2.16b,#8
1307	ext	v6.16b,v0.16b,v4.16b,#8
1308	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1309.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1310	ext	v7.16b,v20.16b,v21.16b,#8
1311.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1312.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1313	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1314.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1315	add	v25.2d,v25.2d,v17.2d
1316	ld1	{v24.2d},[x3],#16
1317	ext	v25.16b,v25.16b,v25.16b,#8
1318	ext	v5.16b,v1.16b,v4.16b,#8
1319	ext	v6.16b,v3.16b,v1.16b,#8
1320	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1321.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1322	ext	v7.16b,v21.16b,v22.16b,#8
1323.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1324.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1325	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1326.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1327	add	v24.2d,v24.2d,v18.2d
1328	ld1	{v25.2d},[x3],#16
1329	ext	v24.16b,v24.16b,v24.16b,#8
1330	ext	v5.16b,v0.16b,v1.16b,#8
1331	ext	v6.16b,v2.16b,v0.16b,#8
1332	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1333.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1334	ext	v7.16b,v22.16b,v23.16b,#8
1335.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1336.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1337	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1338.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1339	add	v25.2d,v25.2d,v19.2d
1340	ld1	{v24.2d},[x3],#16
1341	ext	v25.16b,v25.16b,v25.16b,#8
1342	ext	v5.16b,v3.16b,v0.16b,#8
1343	ext	v6.16b,v4.16b,v3.16b,#8
1344	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1345.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1346	ext	v7.16b,v23.16b,v16.16b,#8
1347.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1348.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1349	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1350.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1351	add	v24.2d,v24.2d,v20.2d
1352	ld1	{v25.2d},[x3],#16
1353	ext	v24.16b,v24.16b,v24.16b,#8
1354	ext	v5.16b,v2.16b,v3.16b,#8
1355	ext	v6.16b,v1.16b,v2.16b,#8
1356	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1357.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1358	ext	v7.16b,v16.16b,v17.16b,#8
1359.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1360.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1361	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1362.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1363	add	v25.2d,v25.2d,v21.2d
1364	ld1	{v24.2d},[x3],#16
1365	ext	v25.16b,v25.16b,v25.16b,#8
1366	ext	v5.16b,v4.16b,v2.16b,#8
1367	ext	v6.16b,v0.16b,v4.16b,#8
1368	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1369.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1370	ext	v7.16b,v17.16b,v18.16b,#8
1371.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1372.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1373	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1374.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1375	add	v24.2d,v24.2d,v22.2d
1376	ld1	{v25.2d},[x3],#16
1377	ext	v24.16b,v24.16b,v24.16b,#8
1378	ext	v5.16b,v1.16b,v4.16b,#8
1379	ext	v6.16b,v3.16b,v1.16b,#8
1380	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1381.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1382	ext	v7.16b,v18.16b,v19.16b,#8
1383.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1384.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1385	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1386.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1387	add	v25.2d,v25.2d,v23.2d
1388	ld1	{v24.2d},[x3],#16
1389	ext	v25.16b,v25.16b,v25.16b,#8
1390	ext	v5.16b,v0.16b,v1.16b,#8
1391	ext	v6.16b,v2.16b,v0.16b,#8
1392	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1393.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1394	ext	v7.16b,v19.16b,v20.16b,#8
1395.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1396.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1397	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1398.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1399	add	v24.2d,v24.2d,v16.2d
1400	ld1	{v25.2d},[x3],#16
1401	ext	v24.16b,v24.16b,v24.16b,#8
1402	ext	v5.16b,v3.16b,v0.16b,#8
1403	ext	v6.16b,v4.16b,v3.16b,#8
1404	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1405.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1406	ext	v7.16b,v20.16b,v21.16b,#8
1407.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1408.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1409	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1410.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1411	add	v25.2d,v25.2d,v17.2d
1412	ld1	{v24.2d},[x3],#16
1413	ext	v25.16b,v25.16b,v25.16b,#8
1414	ext	v5.16b,v2.16b,v3.16b,#8
1415	ext	v6.16b,v1.16b,v2.16b,#8
1416	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1417.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1418	ext	v7.16b,v21.16b,v22.16b,#8
1419.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1420.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1421	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1422.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1423	add	v24.2d,v24.2d,v18.2d
1424	ld1	{v25.2d},[x3],#16
1425	ext	v24.16b,v24.16b,v24.16b,#8
1426	ext	v5.16b,v4.16b,v2.16b,#8
1427	ext	v6.16b,v0.16b,v4.16b,#8
1428	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1429.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1430	ext	v7.16b,v22.16b,v23.16b,#8
1431.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1432.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1433	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1434.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1435	add	v25.2d,v25.2d,v19.2d
1436	ld1	{v24.2d},[x3],#16
1437	ext	v25.16b,v25.16b,v25.16b,#8
1438	ext	v5.16b,v1.16b,v4.16b,#8
1439	ext	v6.16b,v3.16b,v1.16b,#8
1440	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1441.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1442	ext	v7.16b,v23.16b,v16.16b,#8
1443.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1444.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1445	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1446.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1447	add	v24.2d,v24.2d,v20.2d
1448	ld1	{v25.2d},[x3],#16
1449	ext	v24.16b,v24.16b,v24.16b,#8
1450	ext	v5.16b,v0.16b,v1.16b,#8
1451	ext	v6.16b,v2.16b,v0.16b,#8
1452	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1453.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1454	ext	v7.16b,v16.16b,v17.16b,#8
1455.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1456.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1457	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1458.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1459	add	v25.2d,v25.2d,v21.2d
1460	ld1	{v24.2d},[x3],#16
1461	ext	v25.16b,v25.16b,v25.16b,#8
1462	ext	v5.16b,v3.16b,v0.16b,#8
1463	ext	v6.16b,v4.16b,v3.16b,#8
1464	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1465.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1466	ext	v7.16b,v17.16b,v18.16b,#8
1467.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1468.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1469	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1470.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1471	add	v24.2d,v24.2d,v22.2d
1472	ld1	{v25.2d},[x3],#16
1473	ext	v24.16b,v24.16b,v24.16b,#8
1474	ext	v5.16b,v2.16b,v3.16b,#8
1475	ext	v6.16b,v1.16b,v2.16b,#8
1476	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1477.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1478	ext	v7.16b,v18.16b,v19.16b,#8
1479.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1480.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1481	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1482.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1483	add	v25.2d,v25.2d,v23.2d
1484	ld1	{v24.2d},[x3],#16
1485	ext	v25.16b,v25.16b,v25.16b,#8
1486	ext	v5.16b,v4.16b,v2.16b,#8
1487	ext	v6.16b,v0.16b,v4.16b,#8
1488	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1489.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1490	ext	v7.16b,v19.16b,v20.16b,#8
1491.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1492.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1493	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1494.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Groups 33-40: no further message-schedule updates. Once a message
// register has been added to its constants it is overwritten with the next
// 16 bytes of input from [x1] and byte-swapped with rev64, so v16-v23 are
// ready for the next iteration.
1495	ld1	{v25.2d},[x3],#16
1496	add	v24.2d,v24.2d,v16.2d
1497	ld1	{v16.16b},[x1],#16		// load next input
1498	ext	v24.16b,v24.16b,v24.16b,#8
1499	ext	v5.16b,v1.16b,v4.16b,#8
1500	ext	v6.16b,v3.16b,v1.16b,#8
1501	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1502.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1503	rev64	v16.16b,v16.16b
1504	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1505.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1506	ld1	{v24.2d},[x3],#16
1507	add	v25.2d,v25.2d,v17.2d
1508	ld1	{v17.16b},[x1],#16		// load next input
1509	ext	v25.16b,v25.16b,v25.16b,#8
1510	ext	v5.16b,v0.16b,v1.16b,#8
1511	ext	v6.16b,v2.16b,v0.16b,#8
1512	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1513.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1514	rev64	v17.16b,v17.16b
1515	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1516.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1517	ld1	{v25.2d},[x3],#16
1518	add	v24.2d,v24.2d,v18.2d
1519	ld1	{v18.16b},[x1],#16		// load next input
1520	ext	v24.16b,v24.16b,v24.16b,#8
1521	ext	v5.16b,v3.16b,v0.16b,#8
1522	ext	v6.16b,v4.16b,v3.16b,#8
1523	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1524.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1525	rev64	v18.16b,v18.16b
1526	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1527.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1528	ld1	{v24.2d},[x3],#16
1529	add	v25.2d,v25.2d,v19.2d
1530	ld1	{v19.16b},[x1],#16		// load next input
1531	ext	v25.16b,v25.16b,v25.16b,#8
1532	ext	v5.16b,v2.16b,v3.16b,#8
1533	ext	v6.16b,v1.16b,v2.16b,#8
1534	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1535.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1536	rev64	v19.16b,v19.16b
1537	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1538.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1539	ld1	{v25.2d},[x3],#16
1540	add	v24.2d,v24.2d,v20.2d
1541	ld1	{v20.16b},[x1],#16		// load next input
1542	ext	v24.16b,v24.16b,v24.16b,#8
1543	ext	v5.16b,v4.16b,v2.16b,#8
1544	ext	v6.16b,v0.16b,v4.16b,#8
1545	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1546.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1547	rev64	v20.16b,v20.16b
1548	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1549.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1550	ld1	{v24.2d},[x3],#16
1551	add	v25.2d,v25.2d,v21.2d
1552	ld1	{v21.16b},[x1],#16		// load next input
1553	ext	v25.16b,v25.16b,v25.16b,#8
1554	ext	v5.16b,v1.16b,v4.16b,#8
1555	ext	v6.16b,v3.16b,v1.16b,#8
1556	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1557.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1558	rev64	v21.16b,v21.16b
1559	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1560.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1561	ld1	{v25.2d},[x3],#16
1562	add	v24.2d,v24.2d,v22.2d
1563	ld1	{v22.16b},[x1],#16		// load next input
1564	ext	v24.16b,v24.16b,v24.16b,#8
1565	ext	v5.16b,v0.16b,v1.16b,#8
1566	ext	v6.16b,v2.16b,v0.16b,#8
1567	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1568.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1569	rev64	v22.16b,v22.16b
1570	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1571.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 constant pairs (40 * 16 = 80*8 bytes) have been loaded by now;
// move x3 back to the start of .LK512 for the next iteration.
1572	sub	x3,x3,#80*8	// rewind
1573	add	v25.2d,v25.2d,v23.2d
1574	ld1	{v23.16b},[x1],#16		// load next input
1575	ext	v25.16b,v25.16b,v25.16b,#8
1576	ext	v5.16b,v3.16b,v0.16b,#8
1577	ext	v6.16b,v4.16b,v3.16b,#8
1578	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1579.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1580	rev64	v23.16b,v23.16b
1581	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1582.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the copy of the state saved in v26-v29 at the top of the iteration.
1583	add	v0.2d,v0.2d,v26.2d			// accumulate
1584	add	v1.2d,v1.2d,v27.2d
1585	add	v2.2d,v2.2d,v28.2d
1586	add	v3.2d,v3.2d,v29.2d
1587
// Repeat while the loop counter is non-zero.
1588	cbnz	x2,.Loop_hw
1589
1590	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1591
// Pop the 16-byte frame record. Only x29 is reloaded: x30 is never written
// in this routine, so ret still uses the caller's return address.
1592	ldr	x29,[sp],#16
1593	ret
1594.size	sha512_block_data_order_hw,.-sha512_block_data_order_hw
// Closes the "#ifndef __KERNEL__" guard around the hardware routine.
1595#endif
1596#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
1597