xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/crypto/chacha-armv8-linux.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <openssl/arm_arch.h>
8
// Read-only constants shared by the ChaCha20 routines in this file.
9.section	.rodata
10
11.align	5
// .Lsigma: 16 bytes which, read as ASCII in memory order on a little-endian
// target, spell "expand 32-byte k". The routines load them as the first four
// state words.
12.Lsigma:
13.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: the 4x32-bit vector {1,0,0,0}. It sits immediately after the 16 bytes
// of .Lsigma, which is how the NEON code reaches it (post-incremented pointer).
// It is added to the counter row to step the first counter word.
14.Lone:
15.long	1,0,0,0
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by " followed by the author's e-mail address.
16.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
17.align	2
18
19.text
20
// ChaCha20_ctr32_nohw: ChaCha20 keystream generation and XOR using only the
// general-purpose registers, one 64-byte block per outer iteration.
//
// Register use, as seen in the body below:
//   x0   output pointer
//   x1   input pointer
//   x2   number of bytes still to process
//   x3   pointer to the 32-byte key
//   x4   pointer to the 16-byte counter block on entry; reused as round counter
//   x22-x28,x30      the 16 input state words, packed two 32-bit words each
//   w5-w17,w19-w21   the 16 working state words
// x30 (link register) is used as a data register; the return address is
// reloaded from the stack frame before `ret`.
//
// NOTE(review): nothing here checks for a zero byte count. With x2 == 0 the
// code would reach .Loop_tail with x2 == 0 and the `cbnz` exit condition would
// not be met after the first byte - presumably callers guarantee a non-zero
// length; confirm against the C caller.
21.globl	ChaCha20_ctr32_nohw
22.hidden	ChaCha20_ctr32_nohw
23.type	ChaCha20_ctr32_nohw,%function
24.align	5
25ChaCha20_ctr32_nohw:
26	AARCH64_SIGN_LINK_REGISTER
	// 96-byte frame: x29/x30 followed by callee-saved x19-x28.
27	stp	x29,x30,[sp,#-96]!
28	add	x29,sp,#0
29
30	adrp	x5,.Lsigma
31	add	x5,x5,:lo12:.Lsigma
32	stp	x19,x20,[sp,#16]
33	stp	x21,x22,[sp,#32]
34	stp	x23,x24,[sp,#48]
35	stp	x25,x26,[sp,#64]
36	stp	x27,x28,[sp,#80]
	// 64 bytes of scratch below the frame; only the tail path writes to it
	// (one keystream block for the byte-wise XOR).
37	sub	sp,sp,#64
38
39	ldp	x22,x23,[x5]		// load sigma
40	ldp	x24,x25,[x3]		// load key
41	ldp	x26,x27,[x3,#16]
42	ldp	x28,x30,[x4]		// load counter
	// On big-endian targets swap the 32-bit halves so that the low half of
	// each register is the lower-addressed state word, as on little-endian.
43#ifdef	__AARCH64EB__
44	ror	x24,x24,#32
45	ror	x25,x25,#32
46	ror	x26,x26,#32
47	ror	x27,x27,#32
48	ror	x28,x28,#32
49	ror	x30,x30,#32
50#endif
51
52.Loop_outer:
53	mov	w5,w22			// unpack key block
54	lsr	x6,x22,#32
55	mov	w7,w23
56	lsr	x8,x23,#32
57	mov	w9,w24
58	lsr	x10,x24,#32
59	mov	w11,w25
60	lsr	x12,x25,#32
61	mov	w13,w26
62	lsr	x14,x26,#32
63	mov	w15,w27
64	lsr	x16,x27,#32
65	mov	w17,w28
66	lsr	x19,x28,#32
67	mov	w20,w30
68	lsr	x21,x30,#32
69
	// .Loop runs 10 times. The flags set by `subs` are consumed only after
	// the loop (b.lo .Ltail / b.hi .Loop_outer); no instruction in between
	// writes the flags.
70	mov	x4,#10
71	subs	x2,x2,#64
72.Loop:
73	sub	x4,x4,#1
	// First half: add/xor/rotate on the word groups (w5,w9,w13,w17),
	// (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
	// `ror` by 16/20/24/25 on a 32-bit register is a left rotate by 16/12/8/7.
74	add	w5,w5,w9
75	add	w6,w6,w10
76	add	w7,w7,w11
77	add	w8,w8,w12
78	eor	w17,w17,w5
79	eor	w19,w19,w6
80	eor	w20,w20,w7
81	eor	w21,w21,w8
82	ror	w17,w17,#16
83	ror	w19,w19,#16
84	ror	w20,w20,#16
85	ror	w21,w21,#16
86	add	w13,w13,w17
87	add	w14,w14,w19
88	add	w15,w15,w20
89	add	w16,w16,w21
90	eor	w9,w9,w13
91	eor	w10,w10,w14
92	eor	w11,w11,w15
93	eor	w12,w12,w16
94	ror	w9,w9,#20
95	ror	w10,w10,#20
96	ror	w11,w11,#20
97	ror	w12,w12,#20
98	add	w5,w5,w9
99	add	w6,w6,w10
100	add	w7,w7,w11
101	add	w8,w8,w12
102	eor	w17,w17,w5
103	eor	w19,w19,w6
104	eor	w20,w20,w7
105	eor	w21,w21,w8
106	ror	w17,w17,#24
107	ror	w19,w19,#24
108	ror	w20,w20,#24
109	ror	w21,w21,#24
110	add	w13,w13,w17
111	add	w14,w14,w19
112	add	w15,w15,w20
113	add	w16,w16,w21
114	eor	w9,w9,w13
115	eor	w10,w10,w14
116	eor	w11,w11,w15
117	eor	w12,w12,w16
118	ror	w9,w9,#25
119	ror	w10,w10,#25
120	ror	w11,w11,#25
121	ror	w12,w12,#25
	// Second half: same operations on the shifted groups (w5,w10,w15,w21),
	// (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
122	add	w5,w5,w10
123	add	w6,w6,w11
124	add	w7,w7,w12
125	add	w8,w8,w9
126	eor	w21,w21,w5
127	eor	w17,w17,w6
128	eor	w19,w19,w7
129	eor	w20,w20,w8
130	ror	w21,w21,#16
131	ror	w17,w17,#16
132	ror	w19,w19,#16
133	ror	w20,w20,#16
134	add	w15,w15,w21
135	add	w16,w16,w17
136	add	w13,w13,w19
137	add	w14,w14,w20
138	eor	w10,w10,w15
139	eor	w11,w11,w16
140	eor	w12,w12,w13
141	eor	w9,w9,w14
142	ror	w10,w10,#20
143	ror	w11,w11,#20
144	ror	w12,w12,#20
145	ror	w9,w9,#20
146	add	w5,w5,w10
147	add	w6,w6,w11
148	add	w7,w7,w12
149	add	w8,w8,w9
150	eor	w21,w21,w5
151	eor	w17,w17,w6
152	eor	w19,w19,w7
153	eor	w20,w20,w8
154	ror	w21,w21,#24
155	ror	w17,w17,#24
156	ror	w19,w19,#24
157	ror	w20,w20,#24
158	add	w15,w15,w21
159	add	w16,w16,w17
160	add	w13,w13,w19
161	add	w14,w14,w20
162	eor	w10,w10,w15
163	eor	w11,w11,w16
164	eor	w12,w12,w13
165	eor	w9,w9,w14
166	ror	w10,w10,#25
167	ror	w11,w11,#25
168	ror	w12,w12,#25
169	ror	w9,w9,#25
170	cbnz	x4,.Loop
171
	// Add the input state back in. The odd words use a 64-bit add; any carry
	// into bit 32 is discarded by the `lsl#32` when the pair is packed.
172	add	w5,w5,w22		// accumulate key block
173	add	x6,x6,x22,lsr#32
174	add	w7,w7,w23
175	add	x8,x8,x23,lsr#32
176	add	w9,w9,w24
177	add	x10,x10,x24,lsr#32
178	add	w11,w11,w25
179	add	x12,x12,x25,lsr#32
180	add	w13,w13,w26
181	add	x14,x14,x26,lsr#32
182	add	w15,w15,w27
183	add	x16,x16,x27,lsr#32
184	add	w17,w17,w28
185	add	x19,x19,x28,lsr#32
186	add	w20,w20,w30
187	add	x21,x21,x30,lsr#32
188
	// Fewer than 64 bytes were left before the `subs`: take the partial path.
189	b.lo	.Ltail
190
191	add	x5,x5,x6,lsl#32	// pack
192	add	x7,x7,x8,lsl#32
193	ldp	x6,x8,[x1,#0]		// load input
194	add	x9,x9,x10,lsl#32
195	add	x11,x11,x12,lsl#32
196	ldp	x10,x12,[x1,#16]
197	add	x13,x13,x14,lsl#32
198	add	x15,x15,x16,lsl#32
199	ldp	x14,x16,[x1,#32]
200	add	x17,x17,x19,lsl#32
201	add	x20,x20,x21,lsl#32
202	ldp	x19,x21,[x1,#48]
203	add	x1,x1,#64
204#ifdef	__AARCH64EB__
205	rev	x5,x5
206	rev	x7,x7
207	rev	x9,x9
208	rev	x11,x11
209	rev	x13,x13
210	rev	x15,x15
211	rev	x17,x17
212	rev	x20,x20
213#endif
214	eor	x5,x5,x6
215	eor	x7,x7,x8
216	eor	x9,x9,x10
217	eor	x11,x11,x12
218	eor	x13,x13,x14
219	eor	x15,x15,x16
220	eor	x17,x17,x19
221	eor	x20,x20,x21
222
223	stp	x5,x7,[x0,#0]		// store output
224	add	x28,x28,#1			// increment counter
225	stp	x9,x11,[x0,#16]
226	stp	x13,x15,[x0,#32]
227	stp	x17,x20,[x0,#48]
228	add	x0,x0,#64
229
	// Still the flags from `subs x2,x2,#64`: loop while bytes remain.
230	b.hi	.Loop_outer
231
	// Exact multiple of 64 bytes: the scratch area was never written, so it
	// is released without being cleared.
232	ldp	x19,x20,[x29,#16]
233	add	sp,sp,#64
234	ldp	x21,x22,[x29,#32]
235	ldp	x23,x24,[x29,#48]
236	ldp	x25,x26,[x29,#64]
237	ldp	x27,x28,[x29,#80]
238	ldp	x29,x30,[sp],#96
239	AARCH64_VALIDATE_LINK_REGISTER
240	ret
241
242.align	4
243.Ltail:
	// Undo the `subs` so x2 is again the number of bytes left (below 64).
244	add	x2,x2,#64
	// .Less_than_64 is also a branch target from ChaCha20_ctr32_neon, which
	// arrives with the same register layout and the same stack frame shape.
245.Less_than_64:
	// Point x1/x0/x4 at the END of input/output/scratch and count x2 up from
	// -len to 0. x0 is biased by -1 because the loop stores after incrementing x2.
246	sub	x0,x0,#1
247	add	x1,x1,x2
248	add	x0,x0,x2
249	add	x4,sp,x2
250	neg	x2,x2
251
252	add	x5,x5,x6,lsl#32	// pack
253	add	x7,x7,x8,lsl#32
254	add	x9,x9,x10,lsl#32
255	add	x11,x11,x12,lsl#32
256	add	x13,x13,x14,lsl#32
257	add	x15,x15,x16,lsl#32
258	add	x17,x17,x19,lsl#32
259	add	x20,x20,x21,lsl#32
260#ifdef	__AARCH64EB__
261	rev	x5,x5
262	rev	x7,x7
263	rev	x9,x9
264	rev	x11,x11
265	rev	x13,x13
266	rev	x15,x15
267	rev	x17,x17
268	rev	x20,x20
269#endif
	// Spill the whole keystream block to the scratch area for byte access.
270	stp	x5,x7,[sp,#0]
271	stp	x9,x11,[sp,#16]
272	stp	x13,x15,[sp,#32]
273	stp	x17,x20,[sp,#48]
274
275.Loop_tail:
276	ldrb	w10,[x1,x2]
277	ldrb	w11,[x4,x2]
278	add	x2,x2,#1
279	eor	w10,w10,w11
280	strb	w10,[x0,x2]
281	cbnz	x2,.Loop_tail
282
	// Overwrite the spilled keystream with zeros before releasing the stack.
283	stp	xzr,xzr,[sp,#0]
284	stp	xzr,xzr,[sp,#16]
285	stp	xzr,xzr,[sp,#32]
286	stp	xzr,xzr,[sp,#48]
287
288	ldp	x19,x20,[x29,#16]
289	add	sp,sp,#64
290	ldp	x21,x22,[x29,#32]
291	ldp	x23,x24,[x29,#48]
292	ldp	x25,x26,[x29,#64]
293	ldp	x27,x28,[x29,#80]
294	ldp	x29,x30,[sp],#96
295	AARCH64_VALIDATE_LINK_REGISTER
296	ret
297.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
298
// ChaCha20_ctr32_neon: ChaCha20 keystream generation and XOR, 256 bytes per
// outer iteration: one 64-byte block in the general-purpose registers
// interleaved with three blocks in NEON registers.
//
// Arguments are used exactly as in ChaCha20_ctr32_nohw:
//   x0 output, x1 input, x2 byte count, x3 key pointer, x4 counter pointer.
// Register use in the path handled here (byte count below 512):
//   x22-x28,x30 / w5-w21   scalar block (input state / working state)
//   v24        sigma row; v25,v26 key rows (input state, kept across iterations)
//   v27,v28,v29  counter rows of the three NEON blocks
//   v31        counter increment vector ({1,0,0,0}, later shifted to {4,0,0,0})
//   v0-v3, v4-v7, v16-v19  working state of the three NEON blocks
//   v20-v23    temporaries in the round loop, input data afterwards
// Only v0-v7 and v16-v31 are touched in this path, so d8-d15 are not saved.
//
// Byte counts of 512 or more branch to .L512_or_more_neon, which lives inside
// ChaCha20_512_neon just after a prologue identical to the one below.
// Remainders below 64 bytes branch to .Less_than_64 inside ChaCha20_ctr32_nohw.
//
// NOTE(review): as in the scalar routine, a zero byte count is not guarded
// against here - presumably excluded by the caller; confirm.
299.globl	ChaCha20_ctr32_neon
300.hidden	ChaCha20_ctr32_neon
301.type	ChaCha20_ctr32_neon,%function
302.align	5
303ChaCha20_ctr32_neon:
304	AARCH64_SIGN_LINK_REGISTER
305	stp	x29,x30,[sp,#-96]!
306	add	x29,sp,#0
307
308	adrp	x5,.Lsigma
309	add	x5,x5,:lo12:.Lsigma
310	stp	x19,x20,[sp,#16]
311	stp	x21,x22,[sp,#32]
312	stp	x23,x24,[sp,#48]
313	stp	x25,x26,[sp,#64]
314	stp	x27,x28,[sp,#80]
	// Large inputs are handled by the wider code path.
315	cmp	x2,#512
316	b.hs	.L512_or_more_neon
317
	// 64 bytes of scratch, used only to spill a final partial keystream block.
318	sub	sp,sp,#64
319
	// Each piece of state is loaded twice: packed into x registers for the
	// scalar block and as 4x32-bit rows for the NEON blocks.
320	ldp	x22,x23,[x5]		// load sigma
	// Post-increment leaves x5 pointing at .Lone for the later v31 load.
321	ld1	{v24.4s},[x5],#16
322	ldp	x24,x25,[x3]		// load key
323	ldp	x26,x27,[x3,#16]
324	ld1	{v25.4s,v26.4s},[x3]
325	ldp	x28,x30,[x4]		// load counter
326	ld1	{v27.4s},[x4]
327	ld1	{v31.4s},[x5]
328#ifdef	__AARCH64EB__
329	rev64	v24.4s,v24.4s
330	ror	x24,x24,#32
331	ror	x25,x25,#32
332	ror	x26,x26,#32
333	ror	x27,x27,#32
334	ror	x28,x28,#32
335	ror	x30,x30,#32
336#endif
	// The scalar block uses the counter as loaded (x28); the NEON blocks use
	// counter+1 (v27), counter+2 (v28) and counter+3 (v29).
337	add	v27.4s,v27.4s,v31.4s		// += 1
338	add	v28.4s,v27.4s,v31.4s
339	add	v29.4s,v28.4s,v31.4s
340	shl	v31.4s,v31.4s,#2			// 1 -> 4
341
342.Loop_outer_neon:
343	mov	w5,w22			// unpack key block
344	lsr	x6,x22,#32
345	mov	v0.16b,v24.16b
346	mov	w7,w23
347	lsr	x8,x23,#32
348	mov	v4.16b,v24.16b
349	mov	w9,w24
350	lsr	x10,x24,#32
351	mov	v16.16b,v24.16b
352	mov	w11,w25
353	mov	v1.16b,v25.16b
354	lsr	x12,x25,#32
355	mov	v5.16b,v25.16b
356	mov	w13,w26
357	mov	v17.16b,v25.16b
358	lsr	x14,x26,#32
359	mov	v3.16b,v27.16b
360	mov	w15,w27
361	mov	v7.16b,v28.16b
362	lsr	x16,x27,#32
363	mov	v19.16b,v29.16b
364	mov	w17,w28
365	mov	v2.16b,v26.16b
366	lsr	x19,x28,#32
367	mov	v6.16b,v26.16b
368	mov	w20,w30
369	mov	v18.16b,v26.16b
370	lsr	x21,x30,#32
371
	// 10 loop iterations. The flags from `subs` are consumed after the loop
	// by b.lo .Ltail_neon / b.hi .Loop_outer_neon; nothing in between sets flags.
372	mov	x4,#10
373	subs	x2,x2,#256
374.Loop_neon:
375	sub	x4,x4,#1
	// First half. Scalar and NEON instructions alternate line by line and are
	// independent of each other. NEON rotate idioms used below:
	//   rev32 on .8h lanes      = rotate each 32-bit lane by 16
	//   ushr #20 then sli #12   = rotate left by 12
	//   ushr #24 then sli #8    = rotate left by 8
	//   ushr #25 then sli #7    = rotate left by 7
376	add	v0.4s,v0.4s,v1.4s
377	add	w5,w5,w9
378	add	v4.4s,v4.4s,v5.4s
379	add	w6,w6,w10
380	add	v16.4s,v16.4s,v17.4s
381	add	w7,w7,w11
382	eor	v3.16b,v3.16b,v0.16b
383	add	w8,w8,w12
384	eor	v7.16b,v7.16b,v4.16b
385	eor	w17,w17,w5
386	eor	v19.16b,v19.16b,v16.16b
387	eor	w19,w19,w6
388	rev32	v3.8h,v3.8h
389	eor	w20,w20,w7
390	rev32	v7.8h,v7.8h
391	eor	w21,w21,w8
392	rev32	v19.8h,v19.8h
393	ror	w17,w17,#16
394	add	v2.4s,v2.4s,v3.4s
395	ror	w19,w19,#16
396	add	v6.4s,v6.4s,v7.4s
397	ror	w20,w20,#16
398	add	v18.4s,v18.4s,v19.4s
399	ror	w21,w21,#16
400	eor	v20.16b,v1.16b,v2.16b
401	add	w13,w13,w17
402	eor	v21.16b,v5.16b,v6.16b
403	add	w14,w14,w19
404	eor	v22.16b,v17.16b,v18.16b
405	add	w15,w15,w20
406	ushr	v1.4s,v20.4s,#20
407	add	w16,w16,w21
408	ushr	v5.4s,v21.4s,#20
409	eor	w9,w9,w13
410	ushr	v17.4s,v22.4s,#20
411	eor	w10,w10,w14
412	sli	v1.4s,v20.4s,#12
413	eor	w11,w11,w15
414	sli	v5.4s,v21.4s,#12
415	eor	w12,w12,w16
416	sli	v17.4s,v22.4s,#12
417	ror	w9,w9,#20
418	add	v0.4s,v0.4s,v1.4s
419	ror	w10,w10,#20
420	add	v4.4s,v4.4s,v5.4s
421	ror	w11,w11,#20
422	add	v16.4s,v16.4s,v17.4s
423	ror	w12,w12,#20
424	eor	v20.16b,v3.16b,v0.16b
425	add	w5,w5,w9
426	eor	v21.16b,v7.16b,v4.16b
427	add	w6,w6,w10
428	eor	v22.16b,v19.16b,v16.16b
429	add	w7,w7,w11
430	ushr	v3.4s,v20.4s,#24
431	add	w8,w8,w12
432	ushr	v7.4s,v21.4s,#24
433	eor	w17,w17,w5
434	ushr	v19.4s,v22.4s,#24
435	eor	w19,w19,w6
436	sli	v3.4s,v20.4s,#8
437	eor	w20,w20,w7
438	sli	v7.4s,v21.4s,#8
439	eor	w21,w21,w8
440	sli	v19.4s,v22.4s,#8
441	ror	w17,w17,#24
442	add	v2.4s,v2.4s,v3.4s
443	ror	w19,w19,#24
444	add	v6.4s,v6.4s,v7.4s
445	ror	w20,w20,#24
446	add	v18.4s,v18.4s,v19.4s
447	ror	w21,w21,#24
448	eor	v20.16b,v1.16b,v2.16b
449	add	w13,w13,w17
450	eor	v21.16b,v5.16b,v6.16b
451	add	w14,w14,w19
452	eor	v22.16b,v17.16b,v18.16b
453	add	w15,w15,w20
454	ushr	v1.4s,v20.4s,#25
455	add	w16,w16,w21
456	ushr	v5.4s,v21.4s,#25
457	eor	w9,w9,w13
458	ushr	v17.4s,v22.4s,#25
459	eor	w10,w10,w14
460	sli	v1.4s,v20.4s,#7
461	eor	w11,w11,w15
462	sli	v5.4s,v21.4s,#7
463	eor	w12,w12,w16
464	sli	v17.4s,v22.4s,#7
465	ror	w9,w9,#25
	// Rotate the 32-bit lanes within rows 2, 3 and 1 of each NEON block
	// (by 8, 12 and 4 bytes) so the lane-wise operations of the second half
	// combine the shifted word groups, matching the scalar register pattern.
466	ext	v2.16b,v2.16b,v2.16b,#8
467	ror	w10,w10,#25
468	ext	v6.16b,v6.16b,v6.16b,#8
469	ror	w11,w11,#25
470	ext	v18.16b,v18.16b,v18.16b,#8
471	ror	w12,w12,#25
472	ext	v3.16b,v3.16b,v3.16b,#12
473	ext	v7.16b,v7.16b,v7.16b,#12
474	ext	v19.16b,v19.16b,v19.16b,#12
475	ext	v1.16b,v1.16b,v1.16b,#4
476	ext	v5.16b,v5.16b,v5.16b,#4
477	ext	v17.16b,v17.16b,v17.16b,#4
	// Second half.
478	add	v0.4s,v0.4s,v1.4s
479	add	w5,w5,w10
480	add	v4.4s,v4.4s,v5.4s
481	add	w6,w6,w11
482	add	v16.4s,v16.4s,v17.4s
483	add	w7,w7,w12
484	eor	v3.16b,v3.16b,v0.16b
485	add	w8,w8,w9
486	eor	v7.16b,v7.16b,v4.16b
487	eor	w21,w21,w5
488	eor	v19.16b,v19.16b,v16.16b
489	eor	w17,w17,w6
490	rev32	v3.8h,v3.8h
491	eor	w19,w19,w7
492	rev32	v7.8h,v7.8h
493	eor	w20,w20,w8
494	rev32	v19.8h,v19.8h
495	ror	w21,w21,#16
496	add	v2.4s,v2.4s,v3.4s
497	ror	w17,w17,#16
498	add	v6.4s,v6.4s,v7.4s
499	ror	w19,w19,#16
500	add	v18.4s,v18.4s,v19.4s
501	ror	w20,w20,#16
502	eor	v20.16b,v1.16b,v2.16b
503	add	w15,w15,w21
504	eor	v21.16b,v5.16b,v6.16b
505	add	w16,w16,w17
506	eor	v22.16b,v17.16b,v18.16b
507	add	w13,w13,w19
508	ushr	v1.4s,v20.4s,#20
509	add	w14,w14,w20
510	ushr	v5.4s,v21.4s,#20
511	eor	w10,w10,w15
512	ushr	v17.4s,v22.4s,#20
513	eor	w11,w11,w16
514	sli	v1.4s,v20.4s,#12
515	eor	w12,w12,w13
516	sli	v5.4s,v21.4s,#12
517	eor	w9,w9,w14
518	sli	v17.4s,v22.4s,#12
519	ror	w10,w10,#20
520	add	v0.4s,v0.4s,v1.4s
521	ror	w11,w11,#20
522	add	v4.4s,v4.4s,v5.4s
523	ror	w12,w12,#20
524	add	v16.4s,v16.4s,v17.4s
525	ror	w9,w9,#20
526	eor	v20.16b,v3.16b,v0.16b
527	add	w5,w5,w10
528	eor	v21.16b,v7.16b,v4.16b
529	add	w6,w6,w11
530	eor	v22.16b,v19.16b,v16.16b
531	add	w7,w7,w12
532	ushr	v3.4s,v20.4s,#24
533	add	w8,w8,w9
534	ushr	v7.4s,v21.4s,#24
535	eor	w21,w21,w5
536	ushr	v19.4s,v22.4s,#24
537	eor	w17,w17,w6
538	sli	v3.4s,v20.4s,#8
539	eor	w19,w19,w7
540	sli	v7.4s,v21.4s,#8
541	eor	w20,w20,w8
542	sli	v19.4s,v22.4s,#8
543	ror	w21,w21,#24
544	add	v2.4s,v2.4s,v3.4s
545	ror	w17,w17,#24
546	add	v6.4s,v6.4s,v7.4s
547	ror	w19,w19,#24
548	add	v18.4s,v18.4s,v19.4s
549	ror	w20,w20,#24
550	eor	v20.16b,v1.16b,v2.16b
551	add	w15,w15,w21
552	eor	v21.16b,v5.16b,v6.16b
553	add	w16,w16,w17
554	eor	v22.16b,v17.16b,v18.16b
555	add	w13,w13,w19
556	ushr	v1.4s,v20.4s,#25
557	add	w14,w14,w20
558	ushr	v5.4s,v21.4s,#25
559	eor	w10,w10,w15
560	ushr	v17.4s,v22.4s,#25
561	eor	w11,w11,w16
562	sli	v1.4s,v20.4s,#7
563	eor	w12,w12,w13
564	sli	v5.4s,v21.4s,#7
565	eor	w9,w9,w14
566	sli	v17.4s,v22.4s,#7
567	ror	w10,w10,#25
	// Rotate the lanes back (8, 4 and 12 bytes undo the 8, 12 and 4 above).
568	ext	v2.16b,v2.16b,v2.16b,#8
569	ror	w11,w11,#25
570	ext	v6.16b,v6.16b,v6.16b,#8
571	ror	w12,w12,#25
572	ext	v18.16b,v18.16b,v18.16b,#8
573	ror	w9,w9,#25
574	ext	v3.16b,v3.16b,v3.16b,#4
575	ext	v7.16b,v7.16b,v7.16b,#4
576	ext	v19.16b,v19.16b,v19.16b,#4
577	ext	v1.16b,v1.16b,v1.16b,#12
578	ext	v5.16b,v5.16b,v5.16b,#12
579	ext	v17.16b,v17.16b,v17.16b,#12
580	cbnz	x4,.Loop_neon
581
	// Add the input state back into all four blocks. Each NEON block adds
	// its own counter row (v27/v28/v29).
582	add	w5,w5,w22		// accumulate key block
583	add	v0.4s,v0.4s,v24.4s
584	add	x6,x6,x22,lsr#32
585	add	v4.4s,v4.4s,v24.4s
586	add	w7,w7,w23
587	add	v16.4s,v16.4s,v24.4s
588	add	x8,x8,x23,lsr#32
589	add	v2.4s,v2.4s,v26.4s
590	add	w9,w9,w24
591	add	v6.4s,v6.4s,v26.4s
592	add	x10,x10,x24,lsr#32
593	add	v18.4s,v18.4s,v26.4s
594	add	w11,w11,w25
595	add	v3.4s,v3.4s,v27.4s
596	add	x12,x12,x25,lsr#32
597	add	w13,w13,w26
598	add	v7.4s,v7.4s,v28.4s
599	add	x14,x14,x26,lsr#32
600	add	w15,w15,w27
601	add	v19.4s,v19.4s,v29.4s
602	add	x16,x16,x27,lsr#32
603	add	w17,w17,w28
604	add	v1.4s,v1.4s,v25.4s
605	add	x19,x19,x28,lsr#32
606	add	w20,w20,w30
607	add	v5.4s,v5.4s,v25.4s
608	add	x21,x21,x30,lsr#32
609	add	v17.4s,v17.4s,v25.4s
610
	// Fewer than 256 bytes were left before the `subs`: take the partial path.
611	b.lo	.Ltail_neon
612
	// Full 256 bytes. Output order: scalar block, then v0-v3, v4-v7, v16-v19.
613	add	x5,x5,x6,lsl#32	// pack
614	add	x7,x7,x8,lsl#32
615	ldp	x6,x8,[x1,#0]		// load input
616	add	x9,x9,x10,lsl#32
617	add	x11,x11,x12,lsl#32
618	ldp	x10,x12,[x1,#16]
619	add	x13,x13,x14,lsl#32
620	add	x15,x15,x16,lsl#32
621	ldp	x14,x16,[x1,#32]
622	add	x17,x17,x19,lsl#32
623	add	x20,x20,x21,lsl#32
624	ldp	x19,x21,[x1,#48]
625	add	x1,x1,#64
626#ifdef	__AARCH64EB__
627	rev	x5,x5
628	rev	x7,x7
629	rev	x9,x9
630	rev	x11,x11
631	rev	x13,x13
632	rev	x15,x15
633	rev	x17,x17
634	rev	x20,x20
635#endif
636	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
637	eor	x5,x5,x6
638	eor	x7,x7,x8
639	eor	x9,x9,x10
640	eor	x11,x11,x12
641	eor	x13,x13,x14
642	eor	v0.16b,v0.16b,v20.16b
643	eor	x15,x15,x16
644	eor	v1.16b,v1.16b,v21.16b
645	eor	x17,x17,x19
646	eor	v2.16b,v2.16b,v22.16b
647	eor	x20,x20,x21
648	eor	v3.16b,v3.16b,v23.16b
649	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
650
651	stp	x5,x7,[x0,#0]		// store output
	// Four blocks were consumed, so every counter advances by 4.
652	add	x28,x28,#4			// increment counter
653	stp	x9,x11,[x0,#16]
654	add	v27.4s,v27.4s,v31.4s		// += 4
655	stp	x13,x15,[x0,#32]
656	add	v28.4s,v28.4s,v31.4s
657	stp	x17,x20,[x0,#48]
658	add	v29.4s,v29.4s,v31.4s
659	add	x0,x0,#64
660
	// v0-v3 are free once stored, so they are reused to hold the fourth
	// 64 bytes of input.
661	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
662	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
663
664	eor	v4.16b,v4.16b,v20.16b
665	eor	v5.16b,v5.16b,v21.16b
666	eor	v6.16b,v6.16b,v22.16b
667	eor	v7.16b,v7.16b,v23.16b
668	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
669
670	eor	v16.16b,v16.16b,v0.16b
671	eor	v17.16b,v17.16b,v1.16b
672	eor	v18.16b,v18.16b,v2.16b
673	eor	v19.16b,v19.16b,v3.16b
674	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
675
	// Still the flags from `subs x2,x2,#256`: loop while bytes remain.
676	b.hi	.Loop_outer_neon
677
678	ldp	x19,x20,[x29,#16]
679	add	sp,sp,#64
680	ldp	x21,x22,[x29,#32]
681	ldp	x23,x24,[x29,#48]
682	ldp	x25,x26,[x29,#64]
683	ldp	x27,x28,[x29,#80]
684	ldp	x29,x30,[sp],#96
685	AARCH64_VALIDATE_LINK_REGISTER
686	ret
687
688.Ltail_neon:
	// Undo the `subs` so x2 is the number of bytes left (below 256). All four
	// keystream blocks are already computed; consume as many as are needed.
689	add	x2,x2,#256
690	cmp	x2,#64
	// Less than one block: the scalar routine's tail packs the scalar block
	// to the stack and finishes byte by byte.
691	b.lo	.Less_than_64
692
693	add	x5,x5,x6,lsl#32	// pack
694	add	x7,x7,x8,lsl#32
695	ldp	x6,x8,[x1,#0]		// load input
696	add	x9,x9,x10,lsl#32
697	add	x11,x11,x12,lsl#32
698	ldp	x10,x12,[x1,#16]
699	add	x13,x13,x14,lsl#32
700	add	x15,x15,x16,lsl#32
701	ldp	x14,x16,[x1,#32]
702	add	x17,x17,x19,lsl#32
703	add	x20,x20,x21,lsl#32
704	ldp	x19,x21,[x1,#48]
705	add	x1,x1,#64
706#ifdef	__AARCH64EB__
707	rev	x5,x5
708	rev	x7,x7
709	rev	x9,x9
710	rev	x11,x11
711	rev	x13,x13
712	rev	x15,x15
713	rev	x17,x17
714	rev	x20,x20
715#endif
716	eor	x5,x5,x6
717	eor	x7,x7,x8
718	eor	x9,x9,x10
719	eor	x11,x11,x12
720	eor	x13,x13,x14
721	eor	x15,x15,x16
722	eor	x17,x17,x19
723	eor	x20,x20,x21
724
725	stp	x5,x7,[x0,#0]		// store output
726	add	x28,x28,#4			// increment counter
727	stp	x9,x11,[x0,#16]
728	stp	x13,x15,[x0,#32]
729	stp	x17,x20,[x0,#48]
730	add	x0,x0,#64
	// Each b.eq below tests the most recent `cmp x2,#64` (no instruction in
	// between sets flags): exactly 64 bytes were left, so nothing remains.
731	b.eq	.Ldone_neon
732	sub	x2,x2,#64
733	cmp	x2,#64
734	b.lo	.Less_than_128
735
736	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
737	eor	v0.16b,v0.16b,v20.16b
738	eor	v1.16b,v1.16b,v21.16b
739	eor	v2.16b,v2.16b,v22.16b
740	eor	v3.16b,v3.16b,v23.16b
741	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
742	b.eq	.Ldone_neon
743	sub	x2,x2,#64
744	cmp	x2,#64
745	b.lo	.Less_than_192
746
747	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
748	eor	v4.16b,v4.16b,v20.16b
749	eor	v5.16b,v5.16b,v21.16b
750	eor	v6.16b,v6.16b,v22.16b
751	eor	v7.16b,v7.16b,v23.16b
752	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
753	b.eq	.Ldone_neon
754	sub	x2,x2,#64
755
	// In each case below, the next unused keystream block is spilled to the
	// scratch area so the remaining 1..63 bytes can be XORed one at a time.
756	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
757	b	.Last_neon
758
759.Less_than_128:
760	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
761	b	.Last_neon
762.Less_than_192:
763	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
764	b	.Last_neon
765
766.align	4
767.Last_neon:
	// Same end-relative indexing as the scalar tail: pointers are moved to the
	// end of the data and x2 counts up from -len to 0; x0 is biased by -1
	// because the store happens after x2 is incremented.
768	sub	x0,x0,#1
769	add	x1,x1,x2
770	add	x0,x0,x2
771	add	x4,sp,x2
772	neg	x2,x2
773
774.Loop_tail_neon:
775	ldrb	w10,[x1,x2]
776	ldrb	w11,[x4,x2]
777	add	x2,x2,#1
778	eor	w10,w10,w11
779	strb	w10,[x0,x2]
780	cbnz	x2,.Loop_tail_neon
781
	// Overwrite the spilled keystream with zeros before releasing the stack.
782	stp	xzr,xzr,[sp,#0]
783	stp	xzr,xzr,[sp,#16]
784	stp	xzr,xzr,[sp,#32]
785	stp	xzr,xzr,[sp,#48]
786
787.Ldone_neon:
788	ldp	x19,x20,[x29,#16]
789	add	sp,sp,#64
790	ldp	x21,x22,[x29,#32]
791	ldp	x23,x24,[x29,#48]
792	ldp	x25,x26,[x29,#64]
793	ldp	x27,x28,[x29,#80]
794	ldp	x29,x30,[sp],#96
795	AARCH64_VALIDATE_LINK_REGISTER
796	ret
797.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
798.type	ChaCha20_512_neon,%function
799.align	5
800ChaCha20_512_neon:
801	AARCH64_SIGN_LINK_REGISTER
802	stp	x29,x30,[sp,#-96]!
803	add	x29,sp,#0
804
805	adrp	x5,.Lsigma
806	add	x5,x5,:lo12:.Lsigma
807	stp	x19,x20,[sp,#16]
808	stp	x21,x22,[sp,#32]
809	stp	x23,x24,[sp,#48]
810	stp	x25,x26,[sp,#64]
811	stp	x27,x28,[sp,#80]
812
813.L512_or_more_neon:
814	sub	sp,sp,#128+64
815
816	ldp	x22,x23,[x5]		// load sigma
817	ld1	{v24.4s},[x5],#16
818	ldp	x24,x25,[x3]		// load key
819	ldp	x26,x27,[x3,#16]
820	ld1	{v25.4s,v26.4s},[x3]
821	ldp	x28,x30,[x4]		// load counter
822	ld1	{v27.4s},[x4]
823	ld1	{v31.4s},[x5]
824#ifdef	__AARCH64EB__
825	rev64	v24.4s,v24.4s
826	ror	x24,x24,#32
827	ror	x25,x25,#32
828	ror	x26,x26,#32
829	ror	x27,x27,#32
830	ror	x28,x28,#32
831	ror	x30,x30,#32
832#endif
833	add	v27.4s,v27.4s,v31.4s		// += 1
834	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
835	add	v27.4s,v27.4s,v31.4s		// not typo
836	str	q26,[sp,#32]
837	add	v28.4s,v27.4s,v31.4s
838	add	v29.4s,v28.4s,v31.4s
839	add	v30.4s,v29.4s,v31.4s
840	shl	v31.4s,v31.4s,#2			// 1 -> 4
841
842	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
843	stp	d10,d11,[sp,#128+16]
844	stp	d12,d13,[sp,#128+32]
845	stp	d14,d15,[sp,#128+48]
846
847	sub	x2,x2,#512			// not typo
848
849.Loop_outer_512_neon:
850	mov	v0.16b,v24.16b
851	mov	v4.16b,v24.16b
852	mov	v8.16b,v24.16b
853	mov	v12.16b,v24.16b
854	mov	v16.16b,v24.16b
855	mov	v20.16b,v24.16b
856	mov	v1.16b,v25.16b
857	mov	w5,w22			// unpack key block
858	mov	v5.16b,v25.16b
859	lsr	x6,x22,#32
860	mov	v9.16b,v25.16b
861	mov	w7,w23
862	mov	v13.16b,v25.16b
863	lsr	x8,x23,#32
864	mov	v17.16b,v25.16b
865	mov	w9,w24
866	mov	v21.16b,v25.16b
867	lsr	x10,x24,#32
868	mov	v3.16b,v27.16b
869	mov	w11,w25
870	mov	v7.16b,v28.16b
871	lsr	x12,x25,#32
872	mov	v11.16b,v29.16b
873	mov	w13,w26
874	mov	v15.16b,v30.16b
875	lsr	x14,x26,#32
876	mov	v2.16b,v26.16b
877	mov	w15,w27
878	mov	v6.16b,v26.16b
879	lsr	x16,x27,#32
880	add	v19.4s,v3.4s,v31.4s			// +4
881	mov	w17,w28
882	add	v23.4s,v7.4s,v31.4s			// +4
883	lsr	x19,x28,#32
884	mov	v10.16b,v26.16b
885	mov	w20,w30
886	mov	v14.16b,v26.16b
887	lsr	x21,x30,#32
888	mov	v18.16b,v26.16b
889	stp	q27,q28,[sp,#48]		// off-load key block, variable part
890	mov	v22.16b,v26.16b
891	str	q29,[sp,#80]
892
893	mov	x4,#5
894	subs	x2,x2,#512
895.Loop_upper_neon:
896	sub	x4,x4,#1
897	add	v0.4s,v0.4s,v1.4s
898	add	w5,w5,w9
899	add	v4.4s,v4.4s,v5.4s
900	add	w6,w6,w10
901	add	v8.4s,v8.4s,v9.4s
902	add	w7,w7,w11
903	add	v12.4s,v12.4s,v13.4s
904	add	w8,w8,w12
905	add	v16.4s,v16.4s,v17.4s
906	eor	w17,w17,w5
907	add	v20.4s,v20.4s,v21.4s
908	eor	w19,w19,w6
909	eor	v3.16b,v3.16b,v0.16b
910	eor	w20,w20,w7
911	eor	v7.16b,v7.16b,v4.16b
912	eor	w21,w21,w8
913	eor	v11.16b,v11.16b,v8.16b
914	ror	w17,w17,#16
915	eor	v15.16b,v15.16b,v12.16b
916	ror	w19,w19,#16
917	eor	v19.16b,v19.16b,v16.16b
918	ror	w20,w20,#16
919	eor	v23.16b,v23.16b,v20.16b
920	ror	w21,w21,#16
921	rev32	v3.8h,v3.8h
922	add	w13,w13,w17
923	rev32	v7.8h,v7.8h
924	add	w14,w14,w19
925	rev32	v11.8h,v11.8h
926	add	w15,w15,w20
927	rev32	v15.8h,v15.8h
928	add	w16,w16,w21
929	rev32	v19.8h,v19.8h
930	eor	w9,w9,w13
931	rev32	v23.8h,v23.8h
932	eor	w10,w10,w14
933	add	v2.4s,v2.4s,v3.4s
934	eor	w11,w11,w15
935	add	v6.4s,v6.4s,v7.4s
936	eor	w12,w12,w16
937	add	v10.4s,v10.4s,v11.4s
938	ror	w9,w9,#20
939	add	v14.4s,v14.4s,v15.4s
940	ror	w10,w10,#20
941	add	v18.4s,v18.4s,v19.4s
942	ror	w11,w11,#20
943	add	v22.4s,v22.4s,v23.4s
944	ror	w12,w12,#20
945	eor	v24.16b,v1.16b,v2.16b
946	add	w5,w5,w9
947	eor	v25.16b,v5.16b,v6.16b
948	add	w6,w6,w10
949	eor	v26.16b,v9.16b,v10.16b
950	add	w7,w7,w11
951	eor	v27.16b,v13.16b,v14.16b
952	add	w8,w8,w12
953	eor	v28.16b,v17.16b,v18.16b
954	eor	w17,w17,w5
955	eor	v29.16b,v21.16b,v22.16b
956	eor	w19,w19,w6
957	ushr	v1.4s,v24.4s,#20
958	eor	w20,w20,w7
959	ushr	v5.4s,v25.4s,#20
960	eor	w21,w21,w8
961	ushr	v9.4s,v26.4s,#20
962	ror	w17,w17,#24
963	ushr	v13.4s,v27.4s,#20
964	ror	w19,w19,#24
965	ushr	v17.4s,v28.4s,#20
966	ror	w20,w20,#24
967	ushr	v21.4s,v29.4s,#20
968	ror	w21,w21,#24
969	sli	v1.4s,v24.4s,#12
970	add	w13,w13,w17
971	sli	v5.4s,v25.4s,#12
972	add	w14,w14,w19
973	sli	v9.4s,v26.4s,#12
974	add	w15,w15,w20
975	sli	v13.4s,v27.4s,#12
976	add	w16,w16,w21
977	sli	v17.4s,v28.4s,#12
978	eor	w9,w9,w13
979	sli	v21.4s,v29.4s,#12
980	eor	w10,w10,w14
981	add	v0.4s,v0.4s,v1.4s
982	eor	w11,w11,w15
983	add	v4.4s,v4.4s,v5.4s
984	eor	w12,w12,w16
985	add	v8.4s,v8.4s,v9.4s
986	ror	w9,w9,#25
987	add	v12.4s,v12.4s,v13.4s
988	ror	w10,w10,#25
989	add	v16.4s,v16.4s,v17.4s
990	ror	w11,w11,#25
991	add	v20.4s,v20.4s,v21.4s
992	ror	w12,w12,#25
993	eor	v24.16b,v3.16b,v0.16b
994	add	w5,w5,w10
995	eor	v25.16b,v7.16b,v4.16b
996	add	w6,w6,w11
997	eor	v26.16b,v11.16b,v8.16b
998	add	w7,w7,w12
999	eor	v27.16b,v15.16b,v12.16b
1000	add	w8,w8,w9
1001	eor	v28.16b,v19.16b,v16.16b
1002	eor	w21,w21,w5
1003	eor	v29.16b,v23.16b,v20.16b
1004	eor	w17,w17,w6
1005	ushr	v3.4s,v24.4s,#24
1006	eor	w19,w19,w7
1007	ushr	v7.4s,v25.4s,#24
1008	eor	w20,w20,w8
1009	ushr	v11.4s,v26.4s,#24
1010	ror	w21,w21,#16
1011	ushr	v15.4s,v27.4s,#24
1012	ror	w17,w17,#16
1013	ushr	v19.4s,v28.4s,#24
1014	ror	w19,w19,#16
1015	ushr	v23.4s,v29.4s,#24
1016	ror	w20,w20,#16
1017	sli	v3.4s,v24.4s,#8
1018	add	w15,w15,w21
1019	sli	v7.4s,v25.4s,#8
1020	add	w16,w16,w17
1021	sli	v11.4s,v26.4s,#8
1022	add	w13,w13,w19
1023	sli	v15.4s,v27.4s,#8
1024	add	w14,w14,w20
1025	sli	v19.4s,v28.4s,#8
1026	eor	w10,w10,w15
1027	sli	v23.4s,v29.4s,#8
1028	eor	w11,w11,w16
1029	add	v2.4s,v2.4s,v3.4s
1030	eor	w12,w12,w13
1031	add	v6.4s,v6.4s,v7.4s
1032	eor	w9,w9,w14
1033	add	v10.4s,v10.4s,v11.4s
1034	ror	w10,w10,#20
1035	add	v14.4s,v14.4s,v15.4s
1036	ror	w11,w11,#20
1037	add	v18.4s,v18.4s,v19.4s
1038	ror	w12,w12,#20
1039	add	v22.4s,v22.4s,v23.4s
1040	ror	w9,w9,#20
1041	eor	v24.16b,v1.16b,v2.16b
1042	add	w5,w5,w10
1043	eor	v25.16b,v5.16b,v6.16b
1044	add	w6,w6,w11
1045	eor	v26.16b,v9.16b,v10.16b
1046	add	w7,w7,w12
1047	eor	v27.16b,v13.16b,v14.16b
1048	add	w8,w8,w9
1049	eor	v28.16b,v17.16b,v18.16b
1050	eor	w21,w21,w5
1051	eor	v29.16b,v21.16b,v22.16b
1052	eor	w17,w17,w6
1053	ushr	v1.4s,v24.4s,#25
1054	eor	w19,w19,w7
1055	ushr	v5.4s,v25.4s,#25
1056	eor	w20,w20,w8
1057	ushr	v9.4s,v26.4s,#25
1058	ror	w21,w21,#24
1059	ushr	v13.4s,v27.4s,#25
1060	ror	w17,w17,#24
1061	ushr	v17.4s,v28.4s,#25
1062	ror	w19,w19,#24
1063	ushr	v21.4s,v29.4s,#25
1064	ror	w20,w20,#24
1065	sli	v1.4s,v24.4s,#7
1066	add	w15,w15,w21
1067	sli	v5.4s,v25.4s,#7
1068	add	w16,w16,w17
1069	sli	v9.4s,v26.4s,#7
1070	add	w13,w13,w19
1071	sli	v13.4s,v27.4s,#7
1072	add	w14,w14,w20
1073	sli	v17.4s,v28.4s,#7
1074	eor	w10,w10,w15
1075	sli	v21.4s,v29.4s,#7
1076	eor	w11,w11,w16
1077	ext	v2.16b,v2.16b,v2.16b,#8
1078	eor	w12,w12,w13
1079	ext	v6.16b,v6.16b,v6.16b,#8
1080	eor	w9,w9,w14
1081	ext	v10.16b,v10.16b,v10.16b,#8
1082	ror	w10,w10,#25
1083	ext	v14.16b,v14.16b,v14.16b,#8
1084	ror	w11,w11,#25
1085	ext	v18.16b,v18.16b,v18.16b,#8
1086	ror	w12,w12,#25
1087	ext	v22.16b,v22.16b,v22.16b,#8
1088	ror	w9,w9,#25
1089	ext	v3.16b,v3.16b,v3.16b,#12
1090	ext	v7.16b,v7.16b,v7.16b,#12
1091	ext	v11.16b,v11.16b,v11.16b,#12
1092	ext	v15.16b,v15.16b,v15.16b,#12
1093	ext	v19.16b,v19.16b,v19.16b,#12
1094	ext	v23.16b,v23.16b,v23.16b,#12
1095	ext	v1.16b,v1.16b,v1.16b,#4
1096	ext	v5.16b,v5.16b,v5.16b,#4
1097	ext	v9.16b,v9.16b,v9.16b,#4
1098	ext	v13.16b,v13.16b,v13.16b,#4
1099	ext	v17.16b,v17.16b,v17.16b,#4
1100	ext	v21.16b,v21.16b,v21.16b,#4
1101	add	v0.4s,v0.4s,v1.4s
1102	add	w5,w5,w9
1103	add	v4.4s,v4.4s,v5.4s
1104	add	w6,w6,w10
1105	add	v8.4s,v8.4s,v9.4s
1106	add	w7,w7,w11
1107	add	v12.4s,v12.4s,v13.4s
1108	add	w8,w8,w12
1109	add	v16.4s,v16.4s,v17.4s
1110	eor	w17,w17,w5
1111	add	v20.4s,v20.4s,v21.4s
1112	eor	w19,w19,w6
1113	eor	v3.16b,v3.16b,v0.16b
1114	eor	w20,w20,w7
1115	eor	v7.16b,v7.16b,v4.16b
1116	eor	w21,w21,w8
1117	eor	v11.16b,v11.16b,v8.16b
1118	ror	w17,w17,#16
1119	eor	v15.16b,v15.16b,v12.16b
1120	ror	w19,w19,#16
1121	eor	v19.16b,v19.16b,v16.16b
1122	ror	w20,w20,#16
1123	eor	v23.16b,v23.16b,v20.16b
1124	ror	w21,w21,#16
1125	rev32	v3.8h,v3.8h
1126	add	w13,w13,w17
1127	rev32	v7.8h,v7.8h
1128	add	w14,w14,w19
1129	rev32	v11.8h,v11.8h
1130	add	w15,w15,w20
1131	rev32	v15.8h,v15.8h
1132	add	w16,w16,w21
1133	rev32	v19.8h,v19.8h
1134	eor	w9,w9,w13
1135	rev32	v23.8h,v23.8h
1136	eor	w10,w10,w14
1137	add	v2.4s,v2.4s,v3.4s
1138	eor	w11,w11,w15
1139	add	v6.4s,v6.4s,v7.4s
1140	eor	w12,w12,w16
1141	add	v10.4s,v10.4s,v11.4s
1142	ror	w9,w9,#20
1143	add	v14.4s,v14.4s,v15.4s
1144	ror	w10,w10,#20
1145	add	v18.4s,v18.4s,v19.4s
1146	ror	w11,w11,#20
1147	add	v22.4s,v22.4s,v23.4s
1148	ror	w12,w12,#20
1149	eor	v24.16b,v1.16b,v2.16b
1150	add	w5,w5,w9
1151	eor	v25.16b,v5.16b,v6.16b
1152	add	w6,w6,w10
1153	eor	v26.16b,v9.16b,v10.16b
1154	add	w7,w7,w11
1155	eor	v27.16b,v13.16b,v14.16b
1156	add	w8,w8,w12
1157	eor	v28.16b,v17.16b,v18.16b
1158	eor	w17,w17,w5
1159	eor	v29.16b,v21.16b,v22.16b
1160	eor	w19,w19,w6
1161	ushr	v1.4s,v24.4s,#20
1162	eor	w20,w20,w7
1163	ushr	v5.4s,v25.4s,#20
1164	eor	w21,w21,w8
1165	ushr	v9.4s,v26.4s,#20
1166	ror	w17,w17,#24
1167	ushr	v13.4s,v27.4s,#20
1168	ror	w19,w19,#24
1169	ushr	v17.4s,v28.4s,#20
1170	ror	w20,w20,#24
1171	ushr	v21.4s,v29.4s,#20
1172	ror	w21,w21,#24
1173	sli	v1.4s,v24.4s,#12
1174	add	w13,w13,w17
1175	sli	v5.4s,v25.4s,#12
1176	add	w14,w14,w19
1177	sli	v9.4s,v26.4s,#12
1178	add	w15,w15,w20
1179	sli	v13.4s,v27.4s,#12
1180	add	w16,w16,w21
1181	sli	v17.4s,v28.4s,#12
1182	eor	w9,w9,w13
1183	sli	v21.4s,v29.4s,#12
1184	eor	w10,w10,w14
1185	add	v0.4s,v0.4s,v1.4s
1186	eor	w11,w11,w15
1187	add	v4.4s,v4.4s,v5.4s
1188	eor	w12,w12,w16
1189	add	v8.4s,v8.4s,v9.4s
1190	ror	w9,w9,#25
1191	add	v12.4s,v12.4s,v13.4s
1192	ror	w10,w10,#25
1193	add	v16.4s,v16.4s,v17.4s
1194	ror	w11,w11,#25
1195	add	v20.4s,v20.4s,v21.4s
1196	ror	w12,w12,#25
1197	eor	v24.16b,v3.16b,v0.16b
1198	add	w5,w5,w10
1199	eor	v25.16b,v7.16b,v4.16b
1200	add	w6,w6,w11
1201	eor	v26.16b,v11.16b,v8.16b
1202	add	w7,w7,w12
1203	eor	v27.16b,v15.16b,v12.16b
1204	add	w8,w8,w9
1205	eor	v28.16b,v19.16b,v16.16b
1206	eor	w21,w21,w5
1207	eor	v29.16b,v23.16b,v20.16b
1208	eor	w17,w17,w6
1209	ushr	v3.4s,v24.4s,#24
1210	eor	w19,w19,w7
1211	ushr	v7.4s,v25.4s,#24
1212	eor	w20,w20,w8
1213	ushr	v11.4s,v26.4s,#24
1214	ror	w21,w21,#16
1215	ushr	v15.4s,v27.4s,#24
1216	ror	w17,w17,#16
1217	ushr	v19.4s,v28.4s,#24
1218	ror	w19,w19,#16
1219	ushr	v23.4s,v29.4s,#24
1220	ror	w20,w20,#16
1221	sli	v3.4s,v24.4s,#8
1222	add	w15,w15,w21
1223	sli	v7.4s,v25.4s,#8
1224	add	w16,w16,w17
1225	sli	v11.4s,v26.4s,#8
1226	add	w13,w13,w19
1227	sli	v15.4s,v27.4s,#8
1228	add	w14,w14,w20
1229	sli	v19.4s,v28.4s,#8
1230	eor	w10,w10,w15
1231	sli	v23.4s,v29.4s,#8
1232	eor	w11,w11,w16
1233	add	v2.4s,v2.4s,v3.4s
1234	eor	w12,w12,w13
1235	add	v6.4s,v6.4s,v7.4s
1236	eor	w9,w9,w14
1237	add	v10.4s,v10.4s,v11.4s
1238	ror	w10,w10,#20
1239	add	v14.4s,v14.4s,v15.4s
1240	ror	w11,w11,#20
1241	add	v18.4s,v18.4s,v19.4s
1242	ror	w12,w12,#20
1243	add	v22.4s,v22.4s,v23.4s
1244	ror	w9,w9,#20
1245	eor	v24.16b,v1.16b,v2.16b
1246	add	w5,w5,w10
1247	eor	v25.16b,v5.16b,v6.16b
1248	add	w6,w6,w11
1249	eor	v26.16b,v9.16b,v10.16b
1250	add	w7,w7,w12
1251	eor	v27.16b,v13.16b,v14.16b
1252	add	w8,w8,w9
1253	eor	v28.16b,v17.16b,v18.16b
1254	eor	w21,w21,w5
1255	eor	v29.16b,v21.16b,v22.16b
1256	eor	w17,w17,w6
1257	ushr	v1.4s,v24.4s,#25
1258	eor	w19,w19,w7
1259	ushr	v5.4s,v25.4s,#25
1260	eor	w20,w20,w8
1261	ushr	v9.4s,v26.4s,#25
1262	ror	w21,w21,#24
1263	ushr	v13.4s,v27.4s,#25
1264	ror	w17,w17,#24
1265	ushr	v17.4s,v28.4s,#25
1266	ror	w19,w19,#24
1267	ushr	v21.4s,v29.4s,#25
1268	ror	w20,w20,#24
1269	sli	v1.4s,v24.4s,#7
1270	add	w15,w15,w21
1271	sli	v5.4s,v25.4s,#7
1272	add	w16,w16,w17
1273	sli	v9.4s,v26.4s,#7
1274	add	w13,w13,w19
1275	sli	v13.4s,v27.4s,#7
1276	add	w14,w14,w20
1277	sli	v17.4s,v28.4s,#7
1278	eor	w10,w10,w15
1279	sli	v21.4s,v29.4s,#7
1280	eor	w11,w11,w16
1281	ext	v2.16b,v2.16b,v2.16b,#8
1282	eor	w12,w12,w13
1283	ext	v6.16b,v6.16b,v6.16b,#8
1284	eor	w9,w9,w14
1285	ext	v10.16b,v10.16b,v10.16b,#8
1286	ror	w10,w10,#25
1287	ext	v14.16b,v14.16b,v14.16b,#8
1288	ror	w11,w11,#25
1289	ext	v18.16b,v18.16b,v18.16b,#8
1290	ror	w12,w12,#25
1291	ext	v22.16b,v22.16b,v22.16b,#8
1292	ror	w9,w9,#25
1293	ext	v3.16b,v3.16b,v3.16b,#4
1294	ext	v7.16b,v7.16b,v7.16b,#4
1295	ext	v11.16b,v11.16b,v11.16b,#4
1296	ext	v15.16b,v15.16b,v15.16b,#4
1297	ext	v19.16b,v19.16b,v19.16b,#4
1298	ext	v23.16b,v23.16b,v23.16b,#4
1299	ext	v1.16b,v1.16b,v1.16b,#12
1300	ext	v5.16b,v5.16b,v5.16b,#12
1301	ext	v9.16b,v9.16b,v9.16b,#12
1302	ext	v13.16b,v13.16b,v13.16b,#12
1303	ext	v17.16b,v17.16b,v17.16b,#12
1304	ext	v21.16b,v21.16b,v21.16b,#12
1305	cbnz	x4,.Loop_upper_neon
1306
1307	add	w5,w5,w22		// accumulate key block
1308	add	x6,x6,x22,lsr#32
1309	add	w7,w7,w23
1310	add	x8,x8,x23,lsr#32
1311	add	w9,w9,w24
1312	add	x10,x10,x24,lsr#32
1313	add	w11,w11,w25
1314	add	x12,x12,x25,lsr#32
1315	add	w13,w13,w26
1316	add	x14,x14,x26,lsr#32
1317	add	w15,w15,w27
1318	add	x16,x16,x27,lsr#32
1319	add	w17,w17,w28
1320	add	x19,x19,x28,lsr#32
1321	add	w20,w20,w30
1322	add	x21,x21,x30,lsr#32
1323
1324	add	x5,x5,x6,lsl#32	// pack
1325	add	x7,x7,x8,lsl#32
1326	ldp	x6,x8,[x1,#0]		// load input
1327	add	x9,x9,x10,lsl#32
1328	add	x11,x11,x12,lsl#32
1329	ldp	x10,x12,[x1,#16]
1330	add	x13,x13,x14,lsl#32
1331	add	x15,x15,x16,lsl#32
1332	ldp	x14,x16,[x1,#32]
1333	add	x17,x17,x19,lsl#32
1334	add	x20,x20,x21,lsl#32
1335	ldp	x19,x21,[x1,#48]
1336	add	x1,x1,#64
1337#ifdef	__AARCH64EB__
1338	rev	x5,x5
1339	rev	x7,x7
1340	rev	x9,x9
1341	rev	x11,x11
1342	rev	x13,x13
1343	rev	x15,x15
1344	rev	x17,x17
1345	rev	x20,x20
1346#endif
1347	eor	x5,x5,x6
1348	eor	x7,x7,x8
1349	eor	x9,x9,x10
1350	eor	x11,x11,x12
1351	eor	x13,x13,x14
1352	eor	x15,x15,x16
1353	eor	x17,x17,x19
1354	eor	x20,x20,x21
1355
1356	stp	x5,x7,[x0,#0]		// store output
1357	add	x28,x28,#1			// increment counter
1358	mov	w5,w22			// unpack key block
1359	lsr	x6,x22,#32
1360	stp	x9,x11,[x0,#16]
1361	mov	w7,w23
1362	lsr	x8,x23,#32
1363	stp	x13,x15,[x0,#32]
1364	mov	w9,w24
1365	lsr	x10,x24,#32
1366	stp	x17,x20,[x0,#48]
1367	add	x0,x0,#64
1368	mov	w11,w25
1369	lsr	x12,x25,#32
1370	mov	w13,w26
1371	lsr	x14,x26,#32
1372	mov	w15,w27
1373	lsr	x16,x27,#32
1374	mov	w17,w28
1375	lsr	x19,x28,#32
1376	mov	w20,w30
1377	lsr	x21,x30,#32
1378
1379	mov	x4,#5
1380.Loop_lower_neon:
1381	sub	x4,x4,#1
1382	add	v0.4s,v0.4s,v1.4s
1383	add	w5,w5,w9
1384	add	v4.4s,v4.4s,v5.4s
1385	add	w6,w6,w10
1386	add	v8.4s,v8.4s,v9.4s
1387	add	w7,w7,w11
1388	add	v12.4s,v12.4s,v13.4s
1389	add	w8,w8,w12
1390	add	v16.4s,v16.4s,v17.4s
1391	eor	w17,w17,w5
1392	add	v20.4s,v20.4s,v21.4s
1393	eor	w19,w19,w6
1394	eor	v3.16b,v3.16b,v0.16b
1395	eor	w20,w20,w7
1396	eor	v7.16b,v7.16b,v4.16b
1397	eor	w21,w21,w8
1398	eor	v11.16b,v11.16b,v8.16b
1399	ror	w17,w17,#16
1400	eor	v15.16b,v15.16b,v12.16b
1401	ror	w19,w19,#16
1402	eor	v19.16b,v19.16b,v16.16b
1403	ror	w20,w20,#16
1404	eor	v23.16b,v23.16b,v20.16b
1405	ror	w21,w21,#16
1406	rev32	v3.8h,v3.8h
1407	add	w13,w13,w17
1408	rev32	v7.8h,v7.8h
1409	add	w14,w14,w19
1410	rev32	v11.8h,v11.8h
1411	add	w15,w15,w20
1412	rev32	v15.8h,v15.8h
1413	add	w16,w16,w21
1414	rev32	v19.8h,v19.8h
1415	eor	w9,w9,w13
1416	rev32	v23.8h,v23.8h
1417	eor	w10,w10,w14
1418	add	v2.4s,v2.4s,v3.4s
1419	eor	w11,w11,w15
1420	add	v6.4s,v6.4s,v7.4s
1421	eor	w12,w12,w16
1422	add	v10.4s,v10.4s,v11.4s
1423	ror	w9,w9,#20
1424	add	v14.4s,v14.4s,v15.4s
1425	ror	w10,w10,#20
1426	add	v18.4s,v18.4s,v19.4s
1427	ror	w11,w11,#20
1428	add	v22.4s,v22.4s,v23.4s
1429	ror	w12,w12,#20
1430	eor	v24.16b,v1.16b,v2.16b
1431	add	w5,w5,w9
1432	eor	v25.16b,v5.16b,v6.16b
1433	add	w6,w6,w10
1434	eor	v26.16b,v9.16b,v10.16b
1435	add	w7,w7,w11
1436	eor	v27.16b,v13.16b,v14.16b
1437	add	w8,w8,w12
1438	eor	v28.16b,v17.16b,v18.16b
1439	eor	w17,w17,w5
1440	eor	v29.16b,v21.16b,v22.16b
1441	eor	w19,w19,w6
1442	ushr	v1.4s,v24.4s,#20
1443	eor	w20,w20,w7
1444	ushr	v5.4s,v25.4s,#20
1445	eor	w21,w21,w8
1446	ushr	v9.4s,v26.4s,#20
1447	ror	w17,w17,#24
1448	ushr	v13.4s,v27.4s,#20
1449	ror	w19,w19,#24
1450	ushr	v17.4s,v28.4s,#20
1451	ror	w20,w20,#24
1452	ushr	v21.4s,v29.4s,#20
1453	ror	w21,w21,#24
1454	sli	v1.4s,v24.4s,#12
1455	add	w13,w13,w17
1456	sli	v5.4s,v25.4s,#12
1457	add	w14,w14,w19
1458	sli	v9.4s,v26.4s,#12
1459	add	w15,w15,w20
1460	sli	v13.4s,v27.4s,#12
1461	add	w16,w16,w21
1462	sli	v17.4s,v28.4s,#12
1463	eor	w9,w9,w13
1464	sli	v21.4s,v29.4s,#12
1465	eor	w10,w10,w14
1466	add	v0.4s,v0.4s,v1.4s
1467	eor	w11,w11,w15
1468	add	v4.4s,v4.4s,v5.4s
1469	eor	w12,w12,w16
1470	add	v8.4s,v8.4s,v9.4s
1471	ror	w9,w9,#25
1472	add	v12.4s,v12.4s,v13.4s
1473	ror	w10,w10,#25
1474	add	v16.4s,v16.4s,v17.4s
1475	ror	w11,w11,#25
1476	add	v20.4s,v20.4s,v21.4s
1477	ror	w12,w12,#25
1478	eor	v24.16b,v3.16b,v0.16b
1479	add	w5,w5,w10
1480	eor	v25.16b,v7.16b,v4.16b
1481	add	w6,w6,w11
1482	eor	v26.16b,v11.16b,v8.16b
1483	add	w7,w7,w12
1484	eor	v27.16b,v15.16b,v12.16b
1485	add	w8,w8,w9
1486	eor	v28.16b,v19.16b,v16.16b
1487	eor	w21,w21,w5
1488	eor	v29.16b,v23.16b,v20.16b
1489	eor	w17,w17,w6
1490	ushr	v3.4s,v24.4s,#24
1491	eor	w19,w19,w7
1492	ushr	v7.4s,v25.4s,#24
1493	eor	w20,w20,w8
1494	ushr	v11.4s,v26.4s,#24
1495	ror	w21,w21,#16
1496	ushr	v15.4s,v27.4s,#24
1497	ror	w17,w17,#16
1498	ushr	v19.4s,v28.4s,#24
1499	ror	w19,w19,#16
1500	ushr	v23.4s,v29.4s,#24
1501	ror	w20,w20,#16
1502	sli	v3.4s,v24.4s,#8
1503	add	w15,w15,w21
1504	sli	v7.4s,v25.4s,#8
1505	add	w16,w16,w17
1506	sli	v11.4s,v26.4s,#8
1507	add	w13,w13,w19
1508	sli	v15.4s,v27.4s,#8
1509	add	w14,w14,w20
1510	sli	v19.4s,v28.4s,#8
1511	eor	w10,w10,w15
1512	sli	v23.4s,v29.4s,#8
1513	eor	w11,w11,w16
1514	add	v2.4s,v2.4s,v3.4s
1515	eor	w12,w12,w13
1516	add	v6.4s,v6.4s,v7.4s
1517	eor	w9,w9,w14
1518	add	v10.4s,v10.4s,v11.4s
1519	ror	w10,w10,#20
1520	add	v14.4s,v14.4s,v15.4s
1521	ror	w11,w11,#20
1522	add	v18.4s,v18.4s,v19.4s
1523	ror	w12,w12,#20
1524	add	v22.4s,v22.4s,v23.4s
1525	ror	w9,w9,#20
1526	eor	v24.16b,v1.16b,v2.16b
1527	add	w5,w5,w10
1528	eor	v25.16b,v5.16b,v6.16b
1529	add	w6,w6,w11
1530	eor	v26.16b,v9.16b,v10.16b
1531	add	w7,w7,w12
1532	eor	v27.16b,v13.16b,v14.16b
1533	add	w8,w8,w9
1534	eor	v28.16b,v17.16b,v18.16b
1535	eor	w21,w21,w5
1536	eor	v29.16b,v21.16b,v22.16b
1537	eor	w17,w17,w6
1538	ushr	v1.4s,v24.4s,#25
1539	eor	w19,w19,w7
1540	ushr	v5.4s,v25.4s,#25
1541	eor	w20,w20,w8
1542	ushr	v9.4s,v26.4s,#25
1543	ror	w21,w21,#24
1544	ushr	v13.4s,v27.4s,#25
1545	ror	w17,w17,#24
1546	ushr	v17.4s,v28.4s,#25
1547	ror	w19,w19,#24
1548	ushr	v21.4s,v29.4s,#25
1549	ror	w20,w20,#24
1550	sli	v1.4s,v24.4s,#7
1551	add	w15,w15,w21
1552	sli	v5.4s,v25.4s,#7
1553	add	w16,w16,w17
1554	sli	v9.4s,v26.4s,#7
1555	add	w13,w13,w19
1556	sli	v13.4s,v27.4s,#7
1557	add	w14,w14,w20
1558	sli	v17.4s,v28.4s,#7
1559	eor	w10,w10,w15
1560	sli	v21.4s,v29.4s,#7
1561	eor	w11,w11,w16
1562	ext	v2.16b,v2.16b,v2.16b,#8
1563	eor	w12,w12,w13
1564	ext	v6.16b,v6.16b,v6.16b,#8
1565	eor	w9,w9,w14
1566	ext	v10.16b,v10.16b,v10.16b,#8
1567	ror	w10,w10,#25
1568	ext	v14.16b,v14.16b,v14.16b,#8
1569	ror	w11,w11,#25
1570	ext	v18.16b,v18.16b,v18.16b,#8
1571	ror	w12,w12,#25
1572	ext	v22.16b,v22.16b,v22.16b,#8
1573	ror	w9,w9,#25
1574	ext	v3.16b,v3.16b,v3.16b,#12
1575	ext	v7.16b,v7.16b,v7.16b,#12
1576	ext	v11.16b,v11.16b,v11.16b,#12
1577	ext	v15.16b,v15.16b,v15.16b,#12
1578	ext	v19.16b,v19.16b,v19.16b,#12
1579	ext	v23.16b,v23.16b,v23.16b,#12
1580	ext	v1.16b,v1.16b,v1.16b,#4
1581	ext	v5.16b,v5.16b,v5.16b,#4
1582	ext	v9.16b,v9.16b,v9.16b,#4
1583	ext	v13.16b,v13.16b,v13.16b,#4
1584	ext	v17.16b,v17.16b,v17.16b,#4
1585	ext	v21.16b,v21.16b,v21.16b,#4
1586	add	v0.4s,v0.4s,v1.4s
1587	add	w5,w5,w9
1588	add	v4.4s,v4.4s,v5.4s
1589	add	w6,w6,w10
1590	add	v8.4s,v8.4s,v9.4s
1591	add	w7,w7,w11
1592	add	v12.4s,v12.4s,v13.4s
1593	add	w8,w8,w12
1594	add	v16.4s,v16.4s,v17.4s
1595	eor	w17,w17,w5
1596	add	v20.4s,v20.4s,v21.4s
1597	eor	w19,w19,w6
1598	eor	v3.16b,v3.16b,v0.16b
1599	eor	w20,w20,w7
1600	eor	v7.16b,v7.16b,v4.16b
1601	eor	w21,w21,w8
1602	eor	v11.16b,v11.16b,v8.16b
1603	ror	w17,w17,#16
1604	eor	v15.16b,v15.16b,v12.16b
1605	ror	w19,w19,#16
1606	eor	v19.16b,v19.16b,v16.16b
1607	ror	w20,w20,#16
1608	eor	v23.16b,v23.16b,v20.16b
1609	ror	w21,w21,#16
1610	rev32	v3.8h,v3.8h
1611	add	w13,w13,w17
1612	rev32	v7.8h,v7.8h
1613	add	w14,w14,w19
1614	rev32	v11.8h,v11.8h
1615	add	w15,w15,w20
1616	rev32	v15.8h,v15.8h
1617	add	w16,w16,w21
1618	rev32	v19.8h,v19.8h
1619	eor	w9,w9,w13
1620	rev32	v23.8h,v23.8h
1621	eor	w10,w10,w14
1622	add	v2.4s,v2.4s,v3.4s
1623	eor	w11,w11,w15
1624	add	v6.4s,v6.4s,v7.4s
1625	eor	w12,w12,w16
1626	add	v10.4s,v10.4s,v11.4s
1627	ror	w9,w9,#20
1628	add	v14.4s,v14.4s,v15.4s
1629	ror	w10,w10,#20
1630	add	v18.4s,v18.4s,v19.4s
1631	ror	w11,w11,#20
1632	add	v22.4s,v22.4s,v23.4s
1633	ror	w12,w12,#20
1634	eor	v24.16b,v1.16b,v2.16b
1635	add	w5,w5,w9
1636	eor	v25.16b,v5.16b,v6.16b
1637	add	w6,w6,w10
1638	eor	v26.16b,v9.16b,v10.16b
1639	add	w7,w7,w11
1640	eor	v27.16b,v13.16b,v14.16b
1641	add	w8,w8,w12
1642	eor	v28.16b,v17.16b,v18.16b
1643	eor	w17,w17,w5
1644	eor	v29.16b,v21.16b,v22.16b
1645	eor	w19,w19,w6
1646	ushr	v1.4s,v24.4s,#20
1647	eor	w20,w20,w7
1648	ushr	v5.4s,v25.4s,#20
1649	eor	w21,w21,w8
1650	ushr	v9.4s,v26.4s,#20
1651	ror	w17,w17,#24
1652	ushr	v13.4s,v27.4s,#20
1653	ror	w19,w19,#24
1654	ushr	v17.4s,v28.4s,#20
1655	ror	w20,w20,#24
1656	ushr	v21.4s,v29.4s,#20
1657	ror	w21,w21,#24
1658	sli	v1.4s,v24.4s,#12
1659	add	w13,w13,w17
1660	sli	v5.4s,v25.4s,#12
1661	add	w14,w14,w19
1662	sli	v9.4s,v26.4s,#12
1663	add	w15,w15,w20
1664	sli	v13.4s,v27.4s,#12
1665	add	w16,w16,w21
1666	sli	v17.4s,v28.4s,#12
1667	eor	w9,w9,w13
1668	sli	v21.4s,v29.4s,#12
1669	eor	w10,w10,w14
1670	add	v0.4s,v0.4s,v1.4s
1671	eor	w11,w11,w15
1672	add	v4.4s,v4.4s,v5.4s
1673	eor	w12,w12,w16
1674	add	v8.4s,v8.4s,v9.4s
1675	ror	w9,w9,#25
1676	add	v12.4s,v12.4s,v13.4s
1677	ror	w10,w10,#25
1678	add	v16.4s,v16.4s,v17.4s
1679	ror	w11,w11,#25
1680	add	v20.4s,v20.4s,v21.4s
1681	ror	w12,w12,#25
1682	eor	v24.16b,v3.16b,v0.16b
1683	add	w5,w5,w10
1684	eor	v25.16b,v7.16b,v4.16b
1685	add	w6,w6,w11
1686	eor	v26.16b,v11.16b,v8.16b
1687	add	w7,w7,w12
1688	eor	v27.16b,v15.16b,v12.16b
1689	add	w8,w8,w9
1690	eor	v28.16b,v19.16b,v16.16b
1691	eor	w21,w21,w5
1692	eor	v29.16b,v23.16b,v20.16b
1693	eor	w17,w17,w6
1694	ushr	v3.4s,v24.4s,#24
1695	eor	w19,w19,w7
1696	ushr	v7.4s,v25.4s,#24
1697	eor	w20,w20,w8
1698	ushr	v11.4s,v26.4s,#24
1699	ror	w21,w21,#16
1700	ushr	v15.4s,v27.4s,#24
1701	ror	w17,w17,#16
1702	ushr	v19.4s,v28.4s,#24
1703	ror	w19,w19,#16
1704	ushr	v23.4s,v29.4s,#24
1705	ror	w20,w20,#16
1706	sli	v3.4s,v24.4s,#8
1707	add	w15,w15,w21
1708	sli	v7.4s,v25.4s,#8
1709	add	w16,w16,w17
1710	sli	v11.4s,v26.4s,#8
1711	add	w13,w13,w19
1712	sli	v15.4s,v27.4s,#8
1713	add	w14,w14,w20
1714	sli	v19.4s,v28.4s,#8
1715	eor	w10,w10,w15
1716	sli	v23.4s,v29.4s,#8
1717	eor	w11,w11,w16
1718	add	v2.4s,v2.4s,v3.4s
1719	eor	w12,w12,w13
1720	add	v6.4s,v6.4s,v7.4s
1721	eor	w9,w9,w14
1722	add	v10.4s,v10.4s,v11.4s
1723	ror	w10,w10,#20
1724	add	v14.4s,v14.4s,v15.4s
1725	ror	w11,w11,#20
1726	add	v18.4s,v18.4s,v19.4s
1727	ror	w12,w12,#20
1728	add	v22.4s,v22.4s,v23.4s
1729	ror	w9,w9,#20
1730	eor	v24.16b,v1.16b,v2.16b
1731	add	w5,w5,w10
1732	eor	v25.16b,v5.16b,v6.16b
1733	add	w6,w6,w11
1734	eor	v26.16b,v9.16b,v10.16b
1735	add	w7,w7,w12
1736	eor	v27.16b,v13.16b,v14.16b
1737	add	w8,w8,w9
1738	eor	v28.16b,v17.16b,v18.16b
1739	eor	w21,w21,w5
1740	eor	v29.16b,v21.16b,v22.16b
1741	eor	w17,w17,w6
1742	ushr	v1.4s,v24.4s,#25
1743	eor	w19,w19,w7
1744	ushr	v5.4s,v25.4s,#25
1745	eor	w20,w20,w8
1746	ushr	v9.4s,v26.4s,#25
1747	ror	w21,w21,#24
1748	ushr	v13.4s,v27.4s,#25
1749	ror	w17,w17,#24
1750	ushr	v17.4s,v28.4s,#25
1751	ror	w19,w19,#24
1752	ushr	v21.4s,v29.4s,#25
1753	ror	w20,w20,#24
1754	sli	v1.4s,v24.4s,#7
1755	add	w15,w15,w21
1756	sli	v5.4s,v25.4s,#7
1757	add	w16,w16,w17
1758	sli	v9.4s,v26.4s,#7
1759	add	w13,w13,w19
1760	sli	v13.4s,v27.4s,#7
1761	add	w14,w14,w20
1762	sli	v17.4s,v28.4s,#7
1763	eor	w10,w10,w15
1764	sli	v21.4s,v29.4s,#7
1765	eor	w11,w11,w16
1766	ext	v2.16b,v2.16b,v2.16b,#8
1767	eor	w12,w12,w13
1768	ext	v6.16b,v6.16b,v6.16b,#8
1769	eor	w9,w9,w14
1770	ext	v10.16b,v10.16b,v10.16b,#8
1771	ror	w10,w10,#25
1772	ext	v14.16b,v14.16b,v14.16b,#8
1773	ror	w11,w11,#25
1774	ext	v18.16b,v18.16b,v18.16b,#8
1775	ror	w12,w12,#25
1776	ext	v22.16b,v22.16b,v22.16b,#8
1777	ror	w9,w9,#25
1778	ext	v3.16b,v3.16b,v3.16b,#4
1779	ext	v7.16b,v7.16b,v7.16b,#4
1780	ext	v11.16b,v11.16b,v11.16b,#4
1781	ext	v15.16b,v15.16b,v15.16b,#4
1782	ext	v19.16b,v19.16b,v19.16b,#4
1783	ext	v23.16b,v23.16b,v23.16b,#4
1784	ext	v1.16b,v1.16b,v1.16b,#12
1785	ext	v5.16b,v5.16b,v5.16b,#12
1786	ext	v9.16b,v9.16b,v9.16b,#12
1787	ext	v13.16b,v13.16b,v13.16b,#12
1788	ext	v17.16b,v17.16b,v17.16b,#12
1789	ext	v21.16b,v21.16b,v21.16b,#12
1790	cbnz	x4,.Loop_lower_neon
1791
1792	add	w5,w5,w22		// accumulate key block
1793	ldp	q24,q25,[sp,#0]
1794	add	x6,x6,x22,lsr#32
1795	ldp	q26,q27,[sp,#32]
1796	add	w7,w7,w23
1797	ldp	q28,q29,[sp,#64]
1798	add	x8,x8,x23,lsr#32
1799	add	v0.4s,v0.4s,v24.4s
1800	add	w9,w9,w24
1801	add	v4.4s,v4.4s,v24.4s
1802	add	x10,x10,x24,lsr#32
1803	add	v8.4s,v8.4s,v24.4s
1804	add	w11,w11,w25
1805	add	v12.4s,v12.4s,v24.4s
1806	add	x12,x12,x25,lsr#32
1807	add	v16.4s,v16.4s,v24.4s
1808	add	w13,w13,w26
1809	add	v20.4s,v20.4s,v24.4s
1810	add	x14,x14,x26,lsr#32
1811	add	v2.4s,v2.4s,v26.4s
1812	add	w15,w15,w27
1813	add	v6.4s,v6.4s,v26.4s
1814	add	x16,x16,x27,lsr#32
1815	add	v10.4s,v10.4s,v26.4s
1816	add	w17,w17,w28
1817	add	v14.4s,v14.4s,v26.4s
1818	add	x19,x19,x28,lsr#32
1819	add	v18.4s,v18.4s,v26.4s
1820	add	w20,w20,w30
1821	add	v22.4s,v22.4s,v26.4s
1822	add	x21,x21,x30,lsr#32
1823	add	v19.4s,v19.4s,v31.4s			// +4
1824	add	x5,x5,x6,lsl#32	// pack
1825	add	v23.4s,v23.4s,v31.4s			// +4
1826	add	x7,x7,x8,lsl#32
1827	add	v3.4s,v3.4s,v27.4s
1828	ldp	x6,x8,[x1,#0]		// load input
1829	add	v7.4s,v7.4s,v28.4s
1830	add	x9,x9,x10,lsl#32
1831	add	v11.4s,v11.4s,v29.4s
1832	add	x11,x11,x12,lsl#32
1833	add	v15.4s,v15.4s,v30.4s
1834	ldp	x10,x12,[x1,#16]
1835	add	v19.4s,v19.4s,v27.4s
1836	add	x13,x13,x14,lsl#32
1837	add	v23.4s,v23.4s,v28.4s
1838	add	x15,x15,x16,lsl#32
1839	add	v1.4s,v1.4s,v25.4s
1840	ldp	x14,x16,[x1,#32]
1841	add	v5.4s,v5.4s,v25.4s
1842	add	x17,x17,x19,lsl#32
1843	add	v9.4s,v9.4s,v25.4s
1844	add	x20,x20,x21,lsl#32
1845	add	v13.4s,v13.4s,v25.4s
1846	ldp	x19,x21,[x1,#48]
1847	add	v17.4s,v17.4s,v25.4s
1848	add	x1,x1,#64
1849	add	v21.4s,v21.4s,v25.4s
1850
1851#ifdef	__AARCH64EB__
1852	rev	x5,x5
1853	rev	x7,x7
1854	rev	x9,x9
1855	rev	x11,x11
1856	rev	x13,x13
1857	rev	x15,x15
1858	rev	x17,x17
1859	rev	x20,x20
1860#endif
1861	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1862	eor	x5,x5,x6
1863	eor	x7,x7,x8
1864	eor	x9,x9,x10
1865	eor	x11,x11,x12
1866	eor	x13,x13,x14
1867	eor	v0.16b,v0.16b,v24.16b
1868	eor	x15,x15,x16
1869	eor	v1.16b,v1.16b,v25.16b
1870	eor	x17,x17,x19
1871	eor	v2.16b,v2.16b,v26.16b
1872	eor	x20,x20,x21
1873	eor	v3.16b,v3.16b,v27.16b
1874	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1875
1876	stp	x5,x7,[x0,#0]		// store output
1877	add	x28,x28,#7			// increment counter
1878	stp	x9,x11,[x0,#16]
1879	stp	x13,x15,[x0,#32]
1880	stp	x17,x20,[x0,#48]
1881	add	x0,x0,#64
1882	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1883
1884	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1885	eor	v4.16b,v4.16b,v24.16b
1886	eor	v5.16b,v5.16b,v25.16b
1887	eor	v6.16b,v6.16b,v26.16b
1888	eor	v7.16b,v7.16b,v27.16b
1889	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1890
1891	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1892	eor	v8.16b,v8.16b,v0.16b
1893	ldp	q24,q25,[sp,#0]
1894	eor	v9.16b,v9.16b,v1.16b
1895	ldp	q26,q27,[sp,#32]
1896	eor	v10.16b,v10.16b,v2.16b
1897	eor	v11.16b,v11.16b,v3.16b
1898	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1899
1900	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1901	eor	v12.16b,v12.16b,v4.16b
1902	eor	v13.16b,v13.16b,v5.16b
1903	eor	v14.16b,v14.16b,v6.16b
1904	eor	v15.16b,v15.16b,v7.16b
1905	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1906
1907	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1908	eor	v16.16b,v16.16b,v8.16b
1909	eor	v17.16b,v17.16b,v9.16b
1910	eor	v18.16b,v18.16b,v10.16b
1911	eor	v19.16b,v19.16b,v11.16b
1912	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1913
1914	shl	v0.4s,v31.4s,#1			// 4 -> 8
1915	eor	v20.16b,v20.16b,v12.16b
1916	eor	v21.16b,v21.16b,v13.16b
1917	eor	v22.16b,v22.16b,v14.16b
1918	eor	v23.16b,v23.16b,v15.16b
1919	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1920
1921	add	v27.4s,v27.4s,v0.4s			// += 8
1922	add	v28.4s,v28.4s,v0.4s
1923	add	v29.4s,v29.4s,v0.4s
1924	add	v30.4s,v30.4s,v0.4s
1925
1926	b.hs	.Loop_outer_512_neon
1927
1928	adds	x2,x2,#512
1929	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1930
1931	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1932	ldp	d10,d11,[sp,#128+16]
1933	ldp	d12,d13,[sp,#128+32]
1934	ldp	d14,d15,[sp,#128+48]
1935
1936	stp	q24,q31,[sp,#0]		// wipe off-load area
1937	stp	q24,q31,[sp,#32]
1938	stp	q24,q31,[sp,#64]
1939
1940	b.eq	.Ldone_512_neon
1941
1942	cmp	x2,#192
1943	sub	v27.4s,v27.4s,v0.4s			// -= 1
1944	sub	v28.4s,v28.4s,v0.4s
1945	sub	v29.4s,v29.4s,v0.4s
1946	add	sp,sp,#128
1947	b.hs	.Loop_outer_neon
1948
1949	eor	v25.16b,v25.16b,v25.16b
1950	eor	v26.16b,v26.16b,v26.16b
1951	eor	v27.16b,v27.16b,v27.16b
1952	eor	v28.16b,v28.16b,v28.16b
1953	eor	v29.16b,v29.16b,v29.16b
1954	eor	v30.16b,v30.16b,v30.16b
1955	b	.Loop_outer
1956
1957.Ldone_512_neon:
1958	ldp	x19,x20,[x29,#16]
1959	add	sp,sp,#128+64
1960	ldp	x21,x22,[x29,#32]
1961	ldp	x23,x24,[x29,#48]
1962	ldp	x25,x26,[x29,#64]
1963	ldp	x27,x28,[x29,#80]
1964	ldp	x29,x30,[sp],#96
1965	AARCH64_VALIDATE_LINK_REGISTER
1966	ret
1967.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1968#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
1969