1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17
18
19.hidden	OPENSSL_armcap_P
20
// Read-only constants shared by the scalar and NEON ChaCha20 code paths.
21.section	.rodata
22
23.align	5
// .Lsigma: first row of the ChaCha20 state. Read least-significant byte
// first, the two 64-bit values are the ASCII text "expand 32-byte k".
24.Lsigma:
25.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: the vector {1,0,0,0}. It sits 16 bytes after .Lsigma; the NEON code
// reaches it by post-incrementing its .Lsigma pointer and uses it as the
// per-block increment for the counter word.
26.Lone:
27.long	1,0,0,0
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
28.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
29.align	2
30
31.text
32
// ---------------------------------------------------------------------------
// ChaCha20_ctr32 -- XOR a buffer with the ChaCha20 keystream (scalar path and
// dispatcher to the NEON path).
//
// Register roles on entry, as used by the code below:
//   x0 = output pointer (written 64 bytes at a time, then byte-wise in the tail)
//   x1 = input pointer
//   x2 = length in bytes (zero returns immediately)
//   x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block; x4 is afterwards reused as the
//        round counter and, in the tail, as a pointer into the stack scratch
// NOTE(review): this presumably corresponds to the C prototype
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// -- confirm against the declaring header.
//
// Dispatch: when the length is at least 192 bytes and the ARMV7_NEON bit is
// set in OPENSSL_armcap_P, control is transferred to ChaCha20_neon with the
// argument registers untouched. Otherwise the scalar loop below runs.
//
// Scalar state layout (one 32-bit state word per register):
//   words  0..3  -> w5,  w6,  w7,  w8
//   words  4..7  -> w9,  w10, w11, w12
//   words  8..11 -> w13, w14, w15, w16
//   words 12..15 -> w17, w19, w20, w21
// The packed input block is kept in x22..x28 and x30 (two words per register).
//
// Stack: 96 bytes for x29/x30 and x19..x28, plus a 64-byte scratch area at sp
// that holds the last keystream block when a partial block must be processed.
// ---------------------------------------------------------------------------
33.globl	ChaCha20_ctr32
34.hidden	ChaCha20_ctr32
35.type	ChaCha20_ctr32,%function
36.align	5
37ChaCha20_ctr32:
38	AARCH64_VALID_CALL_TARGET
39	cbz	x2,.Labort		// zero length: nothing to do
40#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
41	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
42#else
43	adrp	x5,OPENSSL_armcap_P
44#endif
45	cmp	x2,#192
46	b.lo	.Lshort		// fewer than 192 bytes: scalar path, capability word is not read
47	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
48	tst	w17,#ARMV7_NEON
49	b.ne	ChaCha20_neon		// NEON bit set: continue in ChaCha20_neon (no frame built yet)
50
51.Lshort:
52	AARCH64_SIGN_LINK_REGISTER
53	stp	x29,x30,[sp,#-96]!
54	add	x29,sp,#0
55
56	adrp	x5,.Lsigma
57	add	x5,x5,:lo12:.Lsigma
58	stp	x19,x20,[sp,#16]
59	stp	x21,x22,[sp,#32]
60	stp	x23,x24,[sp,#48]
61	stp	x25,x26,[sp,#64]
62	stp	x27,x28,[sp,#80]
63	sub	sp,sp,#64		// 64-byte scratch for a partial final block
64
	// Load the 16-word input block, two words per 64-bit register.
	// x30 (the link register) is used as a plain data register from here on;
	// its saved copy is reloaded from the frame in both epilogues.
65	ldp	x22,x23,[x5]		// load sigma
66	ldp	x24,x25,[x3]		// load key
67	ldp	x26,x27,[x3,#16]
68	ldp	x28,x30,[x4]		// load counter
69#ifdef	__ARMEB__
	// Big-endian build: swap the two 32-bit halves of each loaded pair so the
	// first word in memory ends up in the low half of the register.
70	ror	x24,x24,#32
71	ror	x25,x25,#32
72	ror	x26,x26,#32
73	ror	x27,x27,#32
74	ror	x28,x28,#32
75	ror	x30,x30,#32
76#endif
77
	// One iteration of .Loop_outer produces one 64-byte keystream block.
78.Loop_outer:
79	mov	w5,w22			// unpack key block
80	lsr	x6,x22,#32
81	mov	w7,w23
82	lsr	x8,x23,#32
83	mov	w9,w24
84	lsr	x10,x24,#32
85	mov	w11,w25
86	lsr	x12,x25,#32
87	mov	w13,w26
88	lsr	x14,x26,#32
89	mov	w15,w27
90	lsr	x16,x27,#32
91	mov	w17,w28
92	lsr	x19,x28,#32
93	mov	w20,w30
94	lsr	x21,x30,#32
95
	// x4 counts 10 iterations; each iteration is one column round followed by
	// one diagonal round (20 rounds in total).
	// The flags set by this subs are consumed by "b.lo .Ltail" and
	// "b.hi .Loop_outer" further down: no instruction in between sets flags.
96	mov	x4,#10
97	subs	x2,x2,#64
98.Loop:
99	sub	x4,x4,#1
	// Column round on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20)
	// (w8,w12,w16,w21): a += b; d ^= a; d = rotl(d,16)
100	add	w5,w5,w9
101	add	w6,w6,w10
102	add	w7,w7,w11
103	add	w8,w8,w12
104	eor	w17,w17,w5
105	eor	w19,w19,w6
106	eor	w20,w20,w7
107	eor	w21,w21,w8
108	ror	w17,w17,#16
109	ror	w19,w19,#16
110	ror	w20,w20,#16
111	ror	w21,w21,#16
	// c += d; b ^= c; b = rotl(b,12)  (ror #20)
112	add	w13,w13,w17
113	add	w14,w14,w19
114	add	w15,w15,w20
115	add	w16,w16,w21
116	eor	w9,w9,w13
117	eor	w10,w10,w14
118	eor	w11,w11,w15
119	eor	w12,w12,w16
120	ror	w9,w9,#20
121	ror	w10,w10,#20
122	ror	w11,w11,#20
123	ror	w12,w12,#20
	// a += b; d ^= a; d = rotl(d,8)  (ror #24)
124	add	w5,w5,w9
125	add	w6,w6,w10
126	add	w7,w7,w11
127	add	w8,w8,w12
128	eor	w17,w17,w5
129	eor	w19,w19,w6
130	eor	w20,w20,w7
131	eor	w21,w21,w8
132	ror	w17,w17,#24
133	ror	w19,w19,#24
134	ror	w20,w20,#24
135	ror	w21,w21,#24
	// c += d; b ^= c; b = rotl(b,7)  (ror #25)
136	add	w13,w13,w17
137	add	w14,w14,w19
138	add	w15,w15,w20
139	add	w16,w16,w21
140	eor	w9,w9,w13
141	eor	w10,w10,w14
142	eor	w11,w11,w15
143	eor	w12,w12,w16
144	ror	w9,w9,#25
145	ror	w10,w10,#25
146	ror	w11,w11,#25
147	ror	w12,w12,#25
	// Diagonal round on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19)
	// (w8,w9,w14,w20): same four steps with the operands shifted.
148	add	w5,w5,w10
149	add	w6,w6,w11
150	add	w7,w7,w12
151	add	w8,w8,w9
152	eor	w21,w21,w5
153	eor	w17,w17,w6
154	eor	w19,w19,w7
155	eor	w20,w20,w8
156	ror	w21,w21,#16
157	ror	w17,w17,#16
158	ror	w19,w19,#16
159	ror	w20,w20,#16
160	add	w15,w15,w21
161	add	w16,w16,w17
162	add	w13,w13,w19
163	add	w14,w14,w20
164	eor	w10,w10,w15
165	eor	w11,w11,w16
166	eor	w12,w12,w13
167	eor	w9,w9,w14
168	ror	w10,w10,#20
169	ror	w11,w11,#20
170	ror	w12,w12,#20
171	ror	w9,w9,#20
172	add	w5,w5,w10
173	add	w6,w6,w11
174	add	w7,w7,w12
175	add	w8,w8,w9
176	eor	w21,w21,w5
177	eor	w17,w17,w6
178	eor	w19,w19,w7
179	eor	w20,w20,w8
180	ror	w21,w21,#24
181	ror	w17,w17,#24
182	ror	w19,w19,#24
183	ror	w20,w20,#24
184	add	w15,w15,w21
185	add	w16,w16,w17
186	add	w13,w13,w19
187	add	w14,w14,w20
188	eor	w10,w10,w15
189	eor	w11,w11,w16
190	eor	w12,w12,w13
191	eor	w9,w9,w14
192	ror	w10,w10,#25
193	ror	w11,w11,#25
194	ror	w12,w12,#25
195	ror	w9,w9,#25
196	cbnz	x4,.Loop
197
	// Add the original input words back in. The 64-bit adds on the odd words
	// may leave garbage above bit 31; the "lsl #32" in the pack step below
	// shifts that garbage out.
198	add	w5,w5,w22		// accumulate key block
199	add	x6,x6,x22,lsr#32
200	add	w7,w7,w23
201	add	x8,x8,x23,lsr#32
202	add	w9,w9,w24
203	add	x10,x10,x24,lsr#32
204	add	w11,w11,w25
205	add	x12,x12,x25,lsr#32
206	add	w13,w13,w26
207	add	x14,x14,x26,lsr#32
208	add	w15,w15,w27
209	add	x16,x16,x27,lsr#32
210	add	w17,w17,w28
211	add	x19,x19,x28,lsr#32
212	add	w20,w20,w30
213	add	x21,x21,x30,lsr#32
214
215	b.lo	.Ltail		// flags from the subs above: fewer than 64 bytes were left
216
	// Full block: pack word pairs into 64-bit registers while loading input.
217	add	x5,x5,x6,lsl#32	// pack
218	add	x7,x7,x8,lsl#32
219	ldp	x6,x8,[x1,#0]		// load input
220	add	x9,x9,x10,lsl#32
221	add	x11,x11,x12,lsl#32
222	ldp	x10,x12,[x1,#16]
223	add	x13,x13,x14,lsl#32
224	add	x15,x15,x16,lsl#32
225	ldp	x14,x16,[x1,#32]
226	add	x17,x17,x19,lsl#32
227	add	x20,x20,x21,lsl#32
228	ldp	x19,x21,[x1,#48]
229	add	x1,x1,#64
230#ifdef	__ARMEB__
231	rev	x5,x5
232	rev	x7,x7
233	rev	x9,x9
234	rev	x11,x11
235	rev	x13,x13
236	rev	x15,x15
237	rev	x17,x17
238	rev	x20,x20
239#endif
240	eor	x5,x5,x6
241	eor	x7,x7,x8
242	eor	x9,x9,x10
243	eor	x11,x11,x12
244	eor	x13,x13,x14
245	eor	x15,x15,x16
246	eor	x17,x17,x19
247	eor	x20,x20,x21
248
	// NOTE(review): the counter increment is a 64-bit add on x28, so a wrap
	// of the low 32-bit word would carry into the word held in the upper
	// half. Presumably callers never let the 32-bit block counter wrap --
	// confirm against the caller.
249	stp	x5,x7,[x0,#0]		// store output
250	add	x28,x28,#1			// increment counter
251	stp	x9,x11,[x0,#16]
252	stp	x13,x15,[x0,#32]
253	stp	x17,x20,[x0,#48]
254	add	x0,x0,#64
255
256	b.hi	.Loop_outer		// flags from the subs above: more input remains
257
	// Exactly consumed: restore callee-saved registers, drop scratch and frame.
258	ldp	x19,x20,[x29,#16]
259	add	sp,sp,#64
260	ldp	x21,x22,[x29,#32]
261	ldp	x23,x24,[x29,#48]
262	ldp	x25,x26,[x29,#64]
263	ldp	x27,x28,[x29,#80]
264	ldp	x29,x30,[sp],#96
265	AARCH64_VALIDATE_LINK_REGISTER
266.Labort:
267	ret
268
	// Partial final block. .Ltail undoes the subs so x2 is again the number
	// of remaining bytes (1..63). .Less_than_64 is also branched to from
	// .Ltail_neon in ChaCha20_neon, with the same register and stack layout.
269.align	4
270.Ltail:
271	add	x2,x2,#64
272.Less_than_64:
	// Bias the pointers to the end of the data and negate x2, so that the
	// byte loop indexes with a negative offset that counts up to zero. x0 is
	// biased by one extra byte because x2 is incremented before the store.
273	sub	x0,x0,#1
274	add	x1,x1,x2
275	add	x0,x0,x2
276	add	x4,sp,x2
277	neg	x2,x2
278
279	add	x5,x5,x6,lsl#32	// pack
280	add	x7,x7,x8,lsl#32
281	add	x9,x9,x10,lsl#32
282	add	x11,x11,x12,lsl#32
283	add	x13,x13,x14,lsl#32
284	add	x15,x15,x16,lsl#32
285	add	x17,x17,x19,lsl#32
286	add	x20,x20,x21,lsl#32
287#ifdef	__ARMEB__
288	rev	x5,x5
289	rev	x7,x7
290	rev	x9,x9
291	rev	x11,x11
292	rev	x13,x13
293	rev	x15,x15
294	rev	x17,x17
295	rev	x20,x20
296#endif
	// Spill the whole keystream block to the stack scratch.
297	stp	x5,x7,[sp,#0]
298	stp	x9,x11,[sp,#16]
299	stp	x13,x15,[sp,#32]
300	stp	x17,x20,[sp,#48]
301
	// out[i] = in[i] ^ keystream[i], one byte at a time.
302.Loop_tail:
303	ldrb	w10,[x1,x2]
304	ldrb	w11,[x4,x2]
305	add	x2,x2,#1
306	eor	w10,w10,w11
307	strb	w10,[x0,x2]
308	cbnz	x2,.Loop_tail
309
	// Overwrite the spilled keystream with zeros before releasing the stack.
310	stp	xzr,xzr,[sp,#0]
311	stp	xzr,xzr,[sp,#16]
312	stp	xzr,xzr,[sp,#32]
313	stp	xzr,xzr,[sp,#48]
314
315	ldp	x19,x20,[x29,#16]
316	add	sp,sp,#64
317	ldp	x21,x22,[x29,#32]
318	ldp	x23,x24,[x29,#48]
319	ldp	x25,x26,[x29,#64]
320	ldp	x27,x28,[x29,#80]
321	ldp	x29,x30,[sp],#96
322	AARCH64_VALIDATE_LINK_REGISTER
323	ret
324.size	ChaCha20_ctr32,.-ChaCha20_ctr32
325
// ---------------------------------------------------------------------------
// ChaCha20_neon -- NEON-assisted ChaCha20, 256 bytes per outer iteration.
//
// Not exported: it is reached by the "b.ne ChaCha20_neon" branch in
// ChaCha20_ctr32 and uses the same argument registers:
//   x0 = output, x1 = input, x2 = length in bytes,
//   x3 = pointer to the 32-byte key, x4 = pointer to the 16-byte counter block.
//
// Lengths of 512 bytes or more are handed to .L512_or_more_neon, a label
// inside ChaCha20_512_neon placed after that function's own copy of the frame
// setup performed here.
//
// Each outer iteration computes four blocks at once:
//   block +0 in scalar registers (same word-per-register layout as the scalar
//            loop of ChaCha20_ctr32: w5..w17, w19..w21)
//   block +1 in v0..v3,  block +2 in v4..v7,  block +3 in v16..v19
//            (one 4-word state row per vector register)
// Constants kept across iterations:
//   v24 = sigma row, v25/v26 = key rows,
//   v27/v28/v29 = counter rows for the three NEON blocks,
//   v31 = {1,0,0,0} while setting up, then {4,0,0,0} as the per-iteration step,
//   x22..x28,x30 = packed input block for the scalar lane.
// v20..v23 are scratch. v8..v15 are not touched by this function.
//
// Stack: the same 96-byte frame plus 64-byte scratch as ChaCha20_ctr32, which
// is what allows the branch to .Less_than_64 in that function.
// ---------------------------------------------------------------------------
326.type	ChaCha20_neon,%function
327.align	5
328ChaCha20_neon:
329	AARCH64_SIGN_LINK_REGISTER
330	stp	x29,x30,[sp,#-96]!
331	add	x29,sp,#0
332
333	adrp	x5,.Lsigma
334	add	x5,x5,:lo12:.Lsigma
335	stp	x19,x20,[sp,#16]
336	stp	x21,x22,[sp,#32]
337	stp	x23,x24,[sp,#48]
338	stp	x25,x26,[sp,#64]
339	stp	x27,x28,[sp,#80]
340	cmp	x2,#512
341	b.hs	.L512_or_more_neon	// 512 bytes or more: continue inside ChaCha20_512_neon
342
343	sub	sp,sp,#64		// 64-byte scratch for a partial final block
344
	// Every input row is loaded twice: packed into x-registers for the scalar
	// lane and into a vector register for the NEON lanes. The post-increment
	// on x5 moves it from .Lsigma to .Lone, which is then loaded into v31.
345	ldp	x22,x23,[x5]		// load sigma
346	ld1	{v24.4s},[x5],#16
347	ldp	x24,x25,[x3]		// load key
348	ldp	x26,x27,[x3,#16]
349	ld1	{v25.4s,v26.4s},[x3]
350	ldp	x28,x30,[x4]		// load counter
351	ld1	{v27.4s},[x4]
352	ld1	{v31.4s},[x5]
353#ifdef	__ARMEB__
354	rev64	v24.4s,v24.4s
355	ror	x24,x24,#32
356	ror	x25,x25,#32
357	ror	x26,x26,#32
358	ror	x27,x27,#32
359	ror	x28,x28,#32
360	ror	x30,x30,#32
361#endif
	// The scalar lane uses the counter as loaded; the NEON lanes use +1, +2, +3.
362	add	v27.4s,v27.4s,v31.4s		// += 1
363	add	v28.4s,v27.4s,v31.4s
364	add	v29.4s,v28.4s,v31.4s
365	shl	v31.4s,v31.4s,#2			// 1 -> 4
366
	// Re-initialise all four working states from the saved input rows.
367.Loop_outer_neon:
368	mov	w5,w22			// unpack key block
369	lsr	x6,x22,#32
370	mov	v0.16b,v24.16b
371	mov	w7,w23
372	lsr	x8,x23,#32
373	mov	v4.16b,v24.16b
374	mov	w9,w24
375	lsr	x10,x24,#32
376	mov	v16.16b,v24.16b
377	mov	w11,w25
378	mov	v1.16b,v25.16b
379	lsr	x12,x25,#32
380	mov	v5.16b,v25.16b
381	mov	w13,w26
382	mov	v17.16b,v25.16b
383	lsr	x14,x26,#32
384	mov	v3.16b,v27.16b
385	mov	w15,w27
386	mov	v7.16b,v28.16b
387	lsr	x16,x27,#32
388	mov	v19.16b,v29.16b
389	mov	w17,w28
390	mov	v2.16b,v26.16b
391	lsr	x19,x28,#32
392	mov	v6.16b,v26.16b
393	mov	w20,w30
394	mov	v18.16b,v26.16b
395	lsr	x21,x30,#32
396
	// x4 counts 10 double rounds. The flags set by this subs are consumed by
	// "b.lo .Ltail_neon" and "b.hi .Loop_outer_neon" below: no instruction in
	// between sets flags.
397	mov	x4,#10
398	subs	x2,x2,#256
	// Scalar and NEON instructions are interleaved; the two streams are
	// independent (scalar block in w-registers, three NEON blocks in
	// v-registers). NEON rotates: rev32 on .8h lanes rotates each 32-bit lane
	// by 16; each ushr #(32-n) / sli #n pair rotates left by n (12, 8, 7).
399.Loop_neon:
400	sub	x4,x4,#1
	// ---- first half: column round ----
401	add	v0.4s,v0.4s,v1.4s
402	add	w5,w5,w9
403	add	v4.4s,v4.4s,v5.4s
404	add	w6,w6,w10
405	add	v16.4s,v16.4s,v17.4s
406	add	w7,w7,w11
407	eor	v3.16b,v3.16b,v0.16b
408	add	w8,w8,w12
409	eor	v7.16b,v7.16b,v4.16b
410	eor	w17,w17,w5
411	eor	v19.16b,v19.16b,v16.16b
412	eor	w19,w19,w6
413	rev32	v3.8h,v3.8h
414	eor	w20,w20,w7
415	rev32	v7.8h,v7.8h
416	eor	w21,w21,w8
417	rev32	v19.8h,v19.8h
418	ror	w17,w17,#16
419	add	v2.4s,v2.4s,v3.4s
420	ror	w19,w19,#16
421	add	v6.4s,v6.4s,v7.4s
422	ror	w20,w20,#16
423	add	v18.4s,v18.4s,v19.4s
424	ror	w21,w21,#16
425	eor	v20.16b,v1.16b,v2.16b
426	add	w13,w13,w17
427	eor	v21.16b,v5.16b,v6.16b
428	add	w14,w14,w19
429	eor	v22.16b,v17.16b,v18.16b
430	add	w15,w15,w20
431	ushr	v1.4s,v20.4s,#20
432	add	w16,w16,w21
433	ushr	v5.4s,v21.4s,#20
434	eor	w9,w9,w13
435	ushr	v17.4s,v22.4s,#20
436	eor	w10,w10,w14
437	sli	v1.4s,v20.4s,#12
438	eor	w11,w11,w15
439	sli	v5.4s,v21.4s,#12
440	eor	w12,w12,w16
441	sli	v17.4s,v22.4s,#12
442	ror	w9,w9,#20
443	add	v0.4s,v0.4s,v1.4s
444	ror	w10,w10,#20
445	add	v4.4s,v4.4s,v5.4s
446	ror	w11,w11,#20
447	add	v16.4s,v16.4s,v17.4s
448	ror	w12,w12,#20
449	eor	v20.16b,v3.16b,v0.16b
450	add	w5,w5,w9
451	eor	v21.16b,v7.16b,v4.16b
452	add	w6,w6,w10
453	eor	v22.16b,v19.16b,v16.16b
454	add	w7,w7,w11
455	ushr	v3.4s,v20.4s,#24
456	add	w8,w8,w12
457	ushr	v7.4s,v21.4s,#24
458	eor	w17,w17,w5
459	ushr	v19.4s,v22.4s,#24
460	eor	w19,w19,w6
461	sli	v3.4s,v20.4s,#8
462	eor	w20,w20,w7
463	sli	v7.4s,v21.4s,#8
464	eor	w21,w21,w8
465	sli	v19.4s,v22.4s,#8
466	ror	w17,w17,#24
467	add	v2.4s,v2.4s,v3.4s
468	ror	w19,w19,#24
469	add	v6.4s,v6.4s,v7.4s
470	ror	w20,w20,#24
471	add	v18.4s,v18.4s,v19.4s
472	ror	w21,w21,#24
473	eor	v20.16b,v1.16b,v2.16b
474	add	w13,w13,w17
475	eor	v21.16b,v5.16b,v6.16b
476	add	w14,w14,w19
477	eor	v22.16b,v17.16b,v18.16b
478	add	w15,w15,w20
479	ushr	v1.4s,v20.4s,#25
480	add	w16,w16,w21
481	ushr	v5.4s,v21.4s,#25
482	eor	w9,w9,w13
483	ushr	v17.4s,v22.4s,#25
484	eor	w10,w10,w14
485	sli	v1.4s,v20.4s,#7
486	eor	w11,w11,w15
487	sli	v5.4s,v21.4s,#7
488	eor	w12,w12,w16
489	sli	v17.4s,v22.4s,#7
	// Rotate NEON rows 1..3 by 1, 2 and 3 lanes (ext #4 / #8 / #12) so that
	// the next pass of the same lane-wise code works on the diagonals.
490	ror	w9,w9,#25
491	ext	v2.16b,v2.16b,v2.16b,#8
492	ror	w10,w10,#25
493	ext	v6.16b,v6.16b,v6.16b,#8
494	ror	w11,w11,#25
495	ext	v18.16b,v18.16b,v18.16b,#8
496	ror	w12,w12,#25
497	ext	v3.16b,v3.16b,v3.16b,#12
498	ext	v7.16b,v7.16b,v7.16b,#12
499	ext	v19.16b,v19.16b,v19.16b,#12
500	ext	v1.16b,v1.16b,v1.16b,#4
501	ext	v5.16b,v5.16b,v5.16b,#4
502	ext	v17.16b,v17.16b,v17.16b,#4
	// ---- second half: diagonal round ----
503	add	v0.4s,v0.4s,v1.4s
504	add	w5,w5,w10
505	add	v4.4s,v4.4s,v5.4s
506	add	w6,w6,w11
507	add	v16.4s,v16.4s,v17.4s
508	add	w7,w7,w12
509	eor	v3.16b,v3.16b,v0.16b
510	add	w8,w8,w9
511	eor	v7.16b,v7.16b,v4.16b
512	eor	w21,w21,w5
513	eor	v19.16b,v19.16b,v16.16b
514	eor	w17,w17,w6
515	rev32	v3.8h,v3.8h
516	eor	w19,w19,w7
517	rev32	v7.8h,v7.8h
518	eor	w20,w20,w8
519	rev32	v19.8h,v19.8h
520	ror	w21,w21,#16
521	add	v2.4s,v2.4s,v3.4s
522	ror	w17,w17,#16
523	add	v6.4s,v6.4s,v7.4s
524	ror	w19,w19,#16
525	add	v18.4s,v18.4s,v19.4s
526	ror	w20,w20,#16
527	eor	v20.16b,v1.16b,v2.16b
528	add	w15,w15,w21
529	eor	v21.16b,v5.16b,v6.16b
530	add	w16,w16,w17
531	eor	v22.16b,v17.16b,v18.16b
532	add	w13,w13,w19
533	ushr	v1.4s,v20.4s,#20
534	add	w14,w14,w20
535	ushr	v5.4s,v21.4s,#20
536	eor	w10,w10,w15
537	ushr	v17.4s,v22.4s,#20
538	eor	w11,w11,w16
539	sli	v1.4s,v20.4s,#12
540	eor	w12,w12,w13
541	sli	v5.4s,v21.4s,#12
542	eor	w9,w9,w14
543	sli	v17.4s,v22.4s,#12
544	ror	w10,w10,#20
545	add	v0.4s,v0.4s,v1.4s
546	ror	w11,w11,#20
547	add	v4.4s,v4.4s,v5.4s
548	ror	w12,w12,#20
549	add	v16.4s,v16.4s,v17.4s
550	ror	w9,w9,#20
551	eor	v20.16b,v3.16b,v0.16b
552	add	w5,w5,w10
553	eor	v21.16b,v7.16b,v4.16b
554	add	w6,w6,w11
555	eor	v22.16b,v19.16b,v16.16b
556	add	w7,w7,w12
557	ushr	v3.4s,v20.4s,#24
558	add	w8,w8,w9
559	ushr	v7.4s,v21.4s,#24
560	eor	w21,w21,w5
561	ushr	v19.4s,v22.4s,#24
562	eor	w17,w17,w6
563	sli	v3.4s,v20.4s,#8
564	eor	w19,w19,w7
565	sli	v7.4s,v21.4s,#8
566	eor	w20,w20,w8
567	sli	v19.4s,v22.4s,#8
568	ror	w21,w21,#24
569	add	v2.4s,v2.4s,v3.4s
570	ror	w17,w17,#24
571	add	v6.4s,v6.4s,v7.4s
572	ror	w19,w19,#24
573	add	v18.4s,v18.4s,v19.4s
574	ror	w20,w20,#24
575	eor	v20.16b,v1.16b,v2.16b
576	add	w15,w15,w21
577	eor	v21.16b,v5.16b,v6.16b
578	add	w16,w16,w17
579	eor	v22.16b,v17.16b,v18.16b
580	add	w13,w13,w19
581	ushr	v1.4s,v20.4s,#25
582	add	w14,w14,w20
583	ushr	v5.4s,v21.4s,#25
584	eor	w10,w10,w15
585	ushr	v17.4s,v22.4s,#25
586	eor	w11,w11,w16
587	sli	v1.4s,v20.4s,#7
588	eor	w12,w12,w13
589	sli	v5.4s,v21.4s,#7
590	eor	w9,w9,w14
591	sli	v17.4s,v22.4s,#7
	// Rotate the NEON rows back (ext #12 / #8 / #4 in the opposite sense) so
	// the state is in row order again for the next iteration.
592	ror	w10,w10,#25
593	ext	v2.16b,v2.16b,v2.16b,#8
594	ror	w11,w11,#25
595	ext	v6.16b,v6.16b,v6.16b,#8
596	ror	w12,w12,#25
597	ext	v18.16b,v18.16b,v18.16b,#8
598	ror	w9,w9,#25
599	ext	v3.16b,v3.16b,v3.16b,#4
600	ext	v7.16b,v7.16b,v7.16b,#4
601	ext	v19.16b,v19.16b,v19.16b,#4
602	ext	v1.16b,v1.16b,v1.16b,#12
603	ext	v5.16b,v5.16b,v5.16b,#12
604	ext	v17.16b,v17.16b,v17.16b,#12
605	cbnz	x4,.Loop_neon
606
	// Add the original input rows back into all four working states.
607	add	w5,w5,w22		// accumulate key block
608	add	v0.4s,v0.4s,v24.4s
609	add	x6,x6,x22,lsr#32
610	add	v4.4s,v4.4s,v24.4s
611	add	w7,w7,w23
612	add	v16.4s,v16.4s,v24.4s
613	add	x8,x8,x23,lsr#32
614	add	v2.4s,v2.4s,v26.4s
615	add	w9,w9,w24
616	add	v6.4s,v6.4s,v26.4s
617	add	x10,x10,x24,lsr#32
618	add	v18.4s,v18.4s,v26.4s
619	add	w11,w11,w25
620	add	v3.4s,v3.4s,v27.4s
621	add	x12,x12,x25,lsr#32
622	add	w13,w13,w26
623	add	v7.4s,v7.4s,v28.4s
624	add	x14,x14,x26,lsr#32
625	add	w15,w15,w27
626	add	v19.4s,v19.4s,v29.4s
627	add	x16,x16,x27,lsr#32
628	add	w17,w17,w28
629	add	v1.4s,v1.4s,v25.4s
630	add	x19,x19,x28,lsr#32
631	add	w20,w20,w30
632	add	v5.4s,v5.4s,v25.4s
633	add	x21,x21,x30,lsr#32
634	add	v17.4s,v17.4s,v25.4s
635
636	b.lo	.Ltail_neon		// flags from the subs above: fewer than 256 bytes were left
637
	// Full 256 bytes: the scalar block covers bytes 0..63, then v0..v3,
	// v4..v7 and v16..v19 cover the next three 64-byte blocks.
638	add	x5,x5,x6,lsl#32	// pack
639	add	x7,x7,x8,lsl#32
640	ldp	x6,x8,[x1,#0]		// load input
641	add	x9,x9,x10,lsl#32
642	add	x11,x11,x12,lsl#32
643	ldp	x10,x12,[x1,#16]
644	add	x13,x13,x14,lsl#32
645	add	x15,x15,x16,lsl#32
646	ldp	x14,x16,[x1,#32]
647	add	x17,x17,x19,lsl#32
648	add	x20,x20,x21,lsl#32
649	ldp	x19,x21,[x1,#48]
650	add	x1,x1,#64
651#ifdef	__ARMEB__
652	rev	x5,x5
653	rev	x7,x7
654	rev	x9,x9
655	rev	x11,x11
656	rev	x13,x13
657	rev	x15,x15
658	rev	x17,x17
659	rev	x20,x20
660#endif
661	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
662	eor	x5,x5,x6
663	eor	x7,x7,x8
664	eor	x9,x9,x10
665	eor	x11,x11,x12
666	eor	x13,x13,x14
667	eor	v0.16b,v0.16b,v20.16b
668	eor	x15,x15,x16
669	eor	v1.16b,v1.16b,v21.16b
670	eor	x17,x17,x19
671	eor	v2.16b,v2.16b,v22.16b
672	eor	x20,x20,x21
673	eor	v3.16b,v3.16b,v23.16b
674	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
675
	// Advance all four counters by 4 blocks. The scalar counter uses a 64-bit
	// add on x28, while the NEON counters use per-lane 32-bit adds.
	// NOTE(review): the two would diverge if the low counter word wrapped;
	// presumably callers prevent that -- confirm against the caller.
676	stp	x5,x7,[x0,#0]		// store output
677	add	x28,x28,#4			// increment counter
678	stp	x9,x11,[x0,#16]
679	add	v27.4s,v27.4s,v31.4s		// += 4
680	stp	x13,x15,[x0,#32]
681	add	v28.4s,v28.4s,v31.4s
682	stp	x17,x20,[x0,#48]
683	add	v29.4s,v29.4s,v31.4s
684	add	x0,x0,#64
685
	// v0..v3 are free once stored, so they are reused as the input buffer
	// for the fourth block.
686	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
687	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
688
689	eor	v4.16b,v4.16b,v20.16b
690	eor	v5.16b,v5.16b,v21.16b
691	eor	v6.16b,v6.16b,v22.16b
692	eor	v7.16b,v7.16b,v23.16b
693	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
694
695	eor	v16.16b,v16.16b,v0.16b
696	eor	v17.16b,v17.16b,v1.16b
697	eor	v18.16b,v18.16b,v2.16b
698	eor	v19.16b,v19.16b,v3.16b
699	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
700
701	b.hi	.Loop_outer_neon	// flags from the subs above: more input remains
702
703	ldp	x19,x20,[x29,#16]
704	add	sp,sp,#64
705	ldp	x21,x22,[x29,#32]
706	ldp	x23,x24,[x29,#48]
707	ldp	x25,x26,[x29,#64]
708	ldp	x27,x28,[x29,#80]
709	ldp	x29,x30,[sp],#96
710	AARCH64_VALIDATE_LINK_REGISTER
711	ret
712
	// Fewer than 256 bytes remain. The add undoes the subs, so x2 is again
	// the remaining byte count (1..255). Whole 64-byte blocks are emitted in
	// the order scalar, v0..v3, v4..v7; a final partial block is spilled to
	// the stack scratch and finished byte by byte.
713.Ltail_neon:
714	add	x2,x2,#256
715	cmp	x2,#64
716	b.lo	.Less_than_64		// under 64 bytes: reuse the scalar tail in ChaCha20_ctr32
717
718	add	x5,x5,x6,lsl#32	// pack
719	add	x7,x7,x8,lsl#32
720	ldp	x6,x8,[x1,#0]		// load input
721	add	x9,x9,x10,lsl#32
722	add	x11,x11,x12,lsl#32
723	ldp	x10,x12,[x1,#16]
724	add	x13,x13,x14,lsl#32
725	add	x15,x15,x16,lsl#32
726	ldp	x14,x16,[x1,#32]
727	add	x17,x17,x19,lsl#32
728	add	x20,x20,x21,lsl#32
729	ldp	x19,x21,[x1,#48]
730	add	x1,x1,#64
731#ifdef	__ARMEB__
732	rev	x5,x5
733	rev	x7,x7
734	rev	x9,x9
735	rev	x11,x11
736	rev	x13,x13
737	rev	x15,x15
738	rev	x17,x17
739	rev	x20,x20
740#endif
741	eor	x5,x5,x6
742	eor	x7,x7,x8
743	eor	x9,x9,x10
744	eor	x11,x11,x12
745	eor	x13,x13,x14
746	eor	x15,x15,x16
747	eor	x17,x17,x19
748	eor	x20,x20,x21
749
750	stp	x5,x7,[x0,#0]		// store output
751	add	x28,x28,#4			// increment counter
752	stp	x9,x11,[x0,#16]
753	stp	x13,x15,[x0,#32]
754	stp	x17,x20,[x0,#48]
755	add	x0,x0,#64
	// Each b.eq below still tests the most recent "cmp x2,#64" (nothing in
	// between sets flags): equal means the block just written was the last.
756	b.eq	.Ldone_neon
757	sub	x2,x2,#64
758	cmp	x2,#64
759	b.lo	.Less_than_128
760
761	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
762	eor	v0.16b,v0.16b,v20.16b
763	eor	v1.16b,v1.16b,v21.16b
764	eor	v2.16b,v2.16b,v22.16b
765	eor	v3.16b,v3.16b,v23.16b
766	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
767	b.eq	.Ldone_neon
768	sub	x2,x2,#64
769	cmp	x2,#64
770	b.lo	.Less_than_192
771
772	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
773	eor	v4.16b,v4.16b,v20.16b
774	eor	v5.16b,v5.16b,v21.16b
775	eor	v6.16b,v6.16b,v22.16b
776	eor	v7.16b,v7.16b,v23.16b
777	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
778	b.eq	.Ldone_neon
779	sub	x2,x2,#64
780
	// Spill whichever keystream block covers the partial remainder.
781	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
782	b	.Last_neon
783
784.Less_than_128:
785	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
786	b	.Last_neon
787.Less_than_192:
788	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
789	b	.Last_neon
790
	// Byte-wise XOR of the last x2 (1..63) bytes against the spilled block.
	// Pointers are biased to the end and x2 is negated so the index counts up
	// to zero; x0 carries an extra -1 because x2 is incremented before the
	// store.
791.align	4
792.Last_neon:
793	sub	x0,x0,#1
794	add	x1,x1,x2
795	add	x0,x0,x2
796	add	x4,sp,x2
797	neg	x2,x2
798
799.Loop_tail_neon:
800	ldrb	w10,[x1,x2]
801	ldrb	w11,[x4,x2]
802	add	x2,x2,#1
803	eor	w10,w10,w11
804	strb	w10,[x0,x2]
805	cbnz	x2,.Loop_tail_neon
806
	// Overwrite the spilled keystream with zeros before releasing the stack.
807	stp	xzr,xzr,[sp,#0]
808	stp	xzr,xzr,[sp,#16]
809	stp	xzr,xzr,[sp,#32]
810	stp	xzr,xzr,[sp,#48]
811
812.Ldone_neon:
813	ldp	x19,x20,[x29,#16]
814	add	sp,sp,#64
815	ldp	x21,x22,[x29,#32]
816	ldp	x23,x24,[x29,#48]
817	ldp	x25,x26,[x29,#64]
818	ldp	x27,x28,[x29,#80]
819	ldp	x29,x30,[sp],#96
820	AARCH64_VALIDATE_LINK_REGISTER
821	ret
822.size	ChaCha20_neon,.-ChaCha20_neon
823.type	ChaCha20_512_neon,%function
824.align	5
825ChaCha20_512_neon:
826	AARCH64_SIGN_LINK_REGISTER
827	stp	x29,x30,[sp,#-96]!
828	add	x29,sp,#0
829
830	adrp	x5,.Lsigma
831	add	x5,x5,:lo12:.Lsigma
832	stp	x19,x20,[sp,#16]
833	stp	x21,x22,[sp,#32]
834	stp	x23,x24,[sp,#48]
835	stp	x25,x26,[sp,#64]
836	stp	x27,x28,[sp,#80]
837
838.L512_or_more_neon:
839	sub	sp,sp,#128+64
840
841	ldp	x22,x23,[x5]		// load sigma
842	ld1	{v24.4s},[x5],#16
843	ldp	x24,x25,[x3]		// load key
844	ldp	x26,x27,[x3,#16]
845	ld1	{v25.4s,v26.4s},[x3]
846	ldp	x28,x30,[x4]		// load counter
847	ld1	{v27.4s},[x4]
848	ld1	{v31.4s},[x5]
849#ifdef	__ARMEB__
850	rev64	v24.4s,v24.4s
851	ror	x24,x24,#32
852	ror	x25,x25,#32
853	ror	x26,x26,#32
854	ror	x27,x27,#32
855	ror	x28,x28,#32
856	ror	x30,x30,#32
857#endif
858	add	v27.4s,v27.4s,v31.4s		// += 1
859	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
860	add	v27.4s,v27.4s,v31.4s		// not typo
861	str	q26,[sp,#32]
862	add	v28.4s,v27.4s,v31.4s
863	add	v29.4s,v28.4s,v31.4s
864	add	v30.4s,v29.4s,v31.4s
865	shl	v31.4s,v31.4s,#2			// 1 -> 4
866
867	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
868	stp	d10,d11,[sp,#128+16]
869	stp	d12,d13,[sp,#128+32]
870	stp	d14,d15,[sp,#128+48]
871
872	sub	x2,x2,#512			// not typo
873
874.Loop_outer_512_neon:
875	mov	v0.16b,v24.16b
876	mov	v4.16b,v24.16b
877	mov	v8.16b,v24.16b
878	mov	v12.16b,v24.16b
879	mov	v16.16b,v24.16b
880	mov	v20.16b,v24.16b
881	mov	v1.16b,v25.16b
882	mov	w5,w22			// unpack key block
883	mov	v5.16b,v25.16b
884	lsr	x6,x22,#32
885	mov	v9.16b,v25.16b
886	mov	w7,w23
887	mov	v13.16b,v25.16b
888	lsr	x8,x23,#32
889	mov	v17.16b,v25.16b
890	mov	w9,w24
891	mov	v21.16b,v25.16b
892	lsr	x10,x24,#32
893	mov	v3.16b,v27.16b
894	mov	w11,w25
895	mov	v7.16b,v28.16b
896	lsr	x12,x25,#32
897	mov	v11.16b,v29.16b
898	mov	w13,w26
899	mov	v15.16b,v30.16b
900	lsr	x14,x26,#32
901	mov	v2.16b,v26.16b
902	mov	w15,w27
903	mov	v6.16b,v26.16b
904	lsr	x16,x27,#32
905	add	v19.4s,v3.4s,v31.4s			// +4
906	mov	w17,w28
907	add	v23.4s,v7.4s,v31.4s			// +4
908	lsr	x19,x28,#32
909	mov	v10.16b,v26.16b
910	mov	w20,w30
911	mov	v14.16b,v26.16b
912	lsr	x21,x30,#32
913	mov	v18.16b,v26.16b
914	stp	q27,q28,[sp,#48]		// off-load key block, variable part
915	mov	v22.16b,v26.16b
916	str	q29,[sp,#80]
917
918	mov	x4,#5
919	subs	x2,x2,#512
920.Loop_upper_neon:
921	sub	x4,x4,#1
922	add	v0.4s,v0.4s,v1.4s
923	add	w5,w5,w9
924	add	v4.4s,v4.4s,v5.4s
925	add	w6,w6,w10
926	add	v8.4s,v8.4s,v9.4s
927	add	w7,w7,w11
928	add	v12.4s,v12.4s,v13.4s
929	add	w8,w8,w12
930	add	v16.4s,v16.4s,v17.4s
931	eor	w17,w17,w5
932	add	v20.4s,v20.4s,v21.4s
933	eor	w19,w19,w6
934	eor	v3.16b,v3.16b,v0.16b
935	eor	w20,w20,w7
936	eor	v7.16b,v7.16b,v4.16b
937	eor	w21,w21,w8
938	eor	v11.16b,v11.16b,v8.16b
939	ror	w17,w17,#16
940	eor	v15.16b,v15.16b,v12.16b
941	ror	w19,w19,#16
942	eor	v19.16b,v19.16b,v16.16b
943	ror	w20,w20,#16
944	eor	v23.16b,v23.16b,v20.16b
945	ror	w21,w21,#16
946	rev32	v3.8h,v3.8h
947	add	w13,w13,w17
948	rev32	v7.8h,v7.8h
949	add	w14,w14,w19
950	rev32	v11.8h,v11.8h
951	add	w15,w15,w20
952	rev32	v15.8h,v15.8h
953	add	w16,w16,w21
954	rev32	v19.8h,v19.8h
955	eor	w9,w9,w13
956	rev32	v23.8h,v23.8h
957	eor	w10,w10,w14
958	add	v2.4s,v2.4s,v3.4s
959	eor	w11,w11,w15
960	add	v6.4s,v6.4s,v7.4s
961	eor	w12,w12,w16
962	add	v10.4s,v10.4s,v11.4s
963	ror	w9,w9,#20
964	add	v14.4s,v14.4s,v15.4s
965	ror	w10,w10,#20
966	add	v18.4s,v18.4s,v19.4s
967	ror	w11,w11,#20
968	add	v22.4s,v22.4s,v23.4s
969	ror	w12,w12,#20
970	eor	v24.16b,v1.16b,v2.16b
971	add	w5,w5,w9
972	eor	v25.16b,v5.16b,v6.16b
973	add	w6,w6,w10
974	eor	v26.16b,v9.16b,v10.16b
975	add	w7,w7,w11
976	eor	v27.16b,v13.16b,v14.16b
977	add	w8,w8,w12
978	eor	v28.16b,v17.16b,v18.16b
979	eor	w17,w17,w5
980	eor	v29.16b,v21.16b,v22.16b
981	eor	w19,w19,w6
982	ushr	v1.4s,v24.4s,#20
983	eor	w20,w20,w7
984	ushr	v5.4s,v25.4s,#20
985	eor	w21,w21,w8
986	ushr	v9.4s,v26.4s,#20
987	ror	w17,w17,#24
988	ushr	v13.4s,v27.4s,#20
989	ror	w19,w19,#24
990	ushr	v17.4s,v28.4s,#20
991	ror	w20,w20,#24
992	ushr	v21.4s,v29.4s,#20
993	ror	w21,w21,#24
994	sli	v1.4s,v24.4s,#12
995	add	w13,w13,w17
996	sli	v5.4s,v25.4s,#12
997	add	w14,w14,w19
998	sli	v9.4s,v26.4s,#12
999	add	w15,w15,w20
1000	sli	v13.4s,v27.4s,#12
1001	add	w16,w16,w21
1002	sli	v17.4s,v28.4s,#12
1003	eor	w9,w9,w13
1004	sli	v21.4s,v29.4s,#12
1005	eor	w10,w10,w14
1006	add	v0.4s,v0.4s,v1.4s
1007	eor	w11,w11,w15
1008	add	v4.4s,v4.4s,v5.4s
1009	eor	w12,w12,w16
1010	add	v8.4s,v8.4s,v9.4s
1011	ror	w9,w9,#25
1012	add	v12.4s,v12.4s,v13.4s
1013	ror	w10,w10,#25
1014	add	v16.4s,v16.4s,v17.4s
1015	ror	w11,w11,#25
1016	add	v20.4s,v20.4s,v21.4s
1017	ror	w12,w12,#25
1018	eor	v24.16b,v3.16b,v0.16b
1019	add	w5,w5,w10
1020	eor	v25.16b,v7.16b,v4.16b
1021	add	w6,w6,w11
1022	eor	v26.16b,v11.16b,v8.16b
1023	add	w7,w7,w12
1024	eor	v27.16b,v15.16b,v12.16b
1025	add	w8,w8,w9
1026	eor	v28.16b,v19.16b,v16.16b
1027	eor	w21,w21,w5
1028	eor	v29.16b,v23.16b,v20.16b
1029	eor	w17,w17,w6
1030	ushr	v3.4s,v24.4s,#24
1031	eor	w19,w19,w7
1032	ushr	v7.4s,v25.4s,#24
1033	eor	w20,w20,w8
1034	ushr	v11.4s,v26.4s,#24
1035	ror	w21,w21,#16
1036	ushr	v15.4s,v27.4s,#24
1037	ror	w17,w17,#16
1038	ushr	v19.4s,v28.4s,#24
1039	ror	w19,w19,#16
1040	ushr	v23.4s,v29.4s,#24
1041	ror	w20,w20,#16
1042	sli	v3.4s,v24.4s,#8
1043	add	w15,w15,w21
1044	sli	v7.4s,v25.4s,#8
1045	add	w16,w16,w17
1046	sli	v11.4s,v26.4s,#8
1047	add	w13,w13,w19
1048	sli	v15.4s,v27.4s,#8
1049	add	w14,w14,w20
1050	sli	v19.4s,v28.4s,#8
1051	eor	w10,w10,w15
1052	sli	v23.4s,v29.4s,#8
1053	eor	w11,w11,w16
1054	add	v2.4s,v2.4s,v3.4s
1055	eor	w12,w12,w13
1056	add	v6.4s,v6.4s,v7.4s
1057	eor	w9,w9,w14
1058	add	v10.4s,v10.4s,v11.4s
1059	ror	w10,w10,#20
1060	add	v14.4s,v14.4s,v15.4s
1061	ror	w11,w11,#20
1062	add	v18.4s,v18.4s,v19.4s
1063	ror	w12,w12,#20
1064	add	v22.4s,v22.4s,v23.4s
1065	ror	w9,w9,#20
1066	eor	v24.16b,v1.16b,v2.16b
1067	add	w5,w5,w10
1068	eor	v25.16b,v5.16b,v6.16b
1069	add	w6,w6,w11
1070	eor	v26.16b,v9.16b,v10.16b
1071	add	w7,w7,w12
1072	eor	v27.16b,v13.16b,v14.16b
1073	add	w8,w8,w9
1074	eor	v28.16b,v17.16b,v18.16b
1075	eor	w21,w21,w5
1076	eor	v29.16b,v21.16b,v22.16b
1077	eor	w17,w17,w6
1078	ushr	v1.4s,v24.4s,#25
1079	eor	w19,w19,w7
1080	ushr	v5.4s,v25.4s,#25
1081	eor	w20,w20,w8
1082	ushr	v9.4s,v26.4s,#25
1083	ror	w21,w21,#24
1084	ushr	v13.4s,v27.4s,#25
1085	ror	w17,w17,#24
1086	ushr	v17.4s,v28.4s,#25
1087	ror	w19,w19,#24
1088	ushr	v21.4s,v29.4s,#25
1089	ror	w20,w20,#24
1090	sli	v1.4s,v24.4s,#7
1091	add	w15,w15,w21
1092	sli	v5.4s,v25.4s,#7
1093	add	w16,w16,w17
1094	sli	v9.4s,v26.4s,#7
1095	add	w13,w13,w19
1096	sli	v13.4s,v27.4s,#7
1097	add	w14,w14,w20
1098	sli	v17.4s,v28.4s,#7
1099	eor	w10,w10,w15
1100	sli	v21.4s,v29.4s,#7
1101	eor	w11,w11,w16
1102	ext	v2.16b,v2.16b,v2.16b,#8
1103	eor	w12,w12,w13
1104	ext	v6.16b,v6.16b,v6.16b,#8
1105	eor	w9,w9,w14
1106	ext	v10.16b,v10.16b,v10.16b,#8
1107	ror	w10,w10,#25
1108	ext	v14.16b,v14.16b,v14.16b,#8
1109	ror	w11,w11,#25
1110	ext	v18.16b,v18.16b,v18.16b,#8
1111	ror	w12,w12,#25
1112	ext	v22.16b,v22.16b,v22.16b,#8
1113	ror	w9,w9,#25
1114	ext	v3.16b,v3.16b,v3.16b,#12
1115	ext	v7.16b,v7.16b,v7.16b,#12
1116	ext	v11.16b,v11.16b,v11.16b,#12
1117	ext	v15.16b,v15.16b,v15.16b,#12
1118	ext	v19.16b,v19.16b,v19.16b,#12
1119	ext	v23.16b,v23.16b,v23.16b,#12
1120	ext	v1.16b,v1.16b,v1.16b,#4
1121	ext	v5.16b,v5.16b,v5.16b,#4
1122	ext	v9.16b,v9.16b,v9.16b,#4
1123	ext	v13.16b,v13.16b,v13.16b,#4
1124	ext	v17.16b,v17.16b,v17.16b,#4
1125	ext	v21.16b,v21.16b,v21.16b,#4
1126	add	v0.4s,v0.4s,v1.4s
1127	add	w5,w5,w9
1128	add	v4.4s,v4.4s,v5.4s
1129	add	w6,w6,w10
1130	add	v8.4s,v8.4s,v9.4s
1131	add	w7,w7,w11
1132	add	v12.4s,v12.4s,v13.4s
1133	add	w8,w8,w12
1134	add	v16.4s,v16.4s,v17.4s
1135	eor	w17,w17,w5
1136	add	v20.4s,v20.4s,v21.4s
1137	eor	w19,w19,w6
1138	eor	v3.16b,v3.16b,v0.16b
1139	eor	w20,w20,w7
1140	eor	v7.16b,v7.16b,v4.16b
1141	eor	w21,w21,w8
1142	eor	v11.16b,v11.16b,v8.16b
1143	ror	w17,w17,#16
1144	eor	v15.16b,v15.16b,v12.16b
1145	ror	w19,w19,#16
1146	eor	v19.16b,v19.16b,v16.16b
1147	ror	w20,w20,#16
1148	eor	v23.16b,v23.16b,v20.16b
1149	ror	w21,w21,#16
1150	rev32	v3.8h,v3.8h
1151	add	w13,w13,w17
1152	rev32	v7.8h,v7.8h
1153	add	w14,w14,w19
1154	rev32	v11.8h,v11.8h
1155	add	w15,w15,w20
1156	rev32	v15.8h,v15.8h
1157	add	w16,w16,w21
1158	rev32	v19.8h,v19.8h
1159	eor	w9,w9,w13
1160	rev32	v23.8h,v23.8h
1161	eor	w10,w10,w14
1162	add	v2.4s,v2.4s,v3.4s
1163	eor	w11,w11,w15
1164	add	v6.4s,v6.4s,v7.4s
1165	eor	w12,w12,w16
1166	add	v10.4s,v10.4s,v11.4s
1167	ror	w9,w9,#20
1168	add	v14.4s,v14.4s,v15.4s
1169	ror	w10,w10,#20
1170	add	v18.4s,v18.4s,v19.4s
1171	ror	w11,w11,#20
1172	add	v22.4s,v22.4s,v23.4s
1173	ror	w12,w12,#20
1174	eor	v24.16b,v1.16b,v2.16b
1175	add	w5,w5,w9
1176	eor	v25.16b,v5.16b,v6.16b
1177	add	w6,w6,w10
1178	eor	v26.16b,v9.16b,v10.16b
1179	add	w7,w7,w11
1180	eor	v27.16b,v13.16b,v14.16b
1181	add	w8,w8,w12
1182	eor	v28.16b,v17.16b,v18.16b
1183	eor	w17,w17,w5
1184	eor	v29.16b,v21.16b,v22.16b
1185	eor	w19,w19,w6
1186	ushr	v1.4s,v24.4s,#20
1187	eor	w20,w20,w7
1188	ushr	v5.4s,v25.4s,#20
1189	eor	w21,w21,w8
1190	ushr	v9.4s,v26.4s,#20
1191	ror	w17,w17,#24
1192	ushr	v13.4s,v27.4s,#20
1193	ror	w19,w19,#24
1194	ushr	v17.4s,v28.4s,#20
1195	ror	w20,w20,#24
1196	ushr	v21.4s,v29.4s,#20
1197	ror	w21,w21,#24
1198	sli	v1.4s,v24.4s,#12
1199	add	w13,w13,w17
1200	sli	v5.4s,v25.4s,#12
1201	add	w14,w14,w19
1202	sli	v9.4s,v26.4s,#12
1203	add	w15,w15,w20
1204	sli	v13.4s,v27.4s,#12
1205	add	w16,w16,w21
1206	sli	v17.4s,v28.4s,#12
1207	eor	w9,w9,w13
1208	sli	v21.4s,v29.4s,#12
1209	eor	w10,w10,w14
1210	add	v0.4s,v0.4s,v1.4s
1211	eor	w11,w11,w15
1212	add	v4.4s,v4.4s,v5.4s
1213	eor	w12,w12,w16
1214	add	v8.4s,v8.4s,v9.4s
1215	ror	w9,w9,#25
1216	add	v12.4s,v12.4s,v13.4s
1217	ror	w10,w10,#25
1218	add	v16.4s,v16.4s,v17.4s
1219	ror	w11,w11,#25
1220	add	v20.4s,v20.4s,v21.4s
1221	ror	w12,w12,#25
1222	eor	v24.16b,v3.16b,v0.16b
1223	add	w5,w5,w10
1224	eor	v25.16b,v7.16b,v4.16b
1225	add	w6,w6,w11
1226	eor	v26.16b,v11.16b,v8.16b
1227	add	w7,w7,w12
1228	eor	v27.16b,v15.16b,v12.16b
1229	add	w8,w8,w9
1230	eor	v28.16b,v19.16b,v16.16b
1231	eor	w21,w21,w5
1232	eor	v29.16b,v23.16b,v20.16b
1233	eor	w17,w17,w6
1234	ushr	v3.4s,v24.4s,#24
1235	eor	w19,w19,w7
1236	ushr	v7.4s,v25.4s,#24
1237	eor	w20,w20,w8
1238	ushr	v11.4s,v26.4s,#24
1239	ror	w21,w21,#16
1240	ushr	v15.4s,v27.4s,#24
1241	ror	w17,w17,#16
1242	ushr	v19.4s,v28.4s,#24
1243	ror	w19,w19,#16
1244	ushr	v23.4s,v29.4s,#24
1245	ror	w20,w20,#16
1246	sli	v3.4s,v24.4s,#8
1247	add	w15,w15,w21
1248	sli	v7.4s,v25.4s,#8
1249	add	w16,w16,w17
1250	sli	v11.4s,v26.4s,#8
1251	add	w13,w13,w19
1252	sli	v15.4s,v27.4s,#8
1253	add	w14,w14,w20
1254	sli	v19.4s,v28.4s,#8
1255	eor	w10,w10,w15
1256	sli	v23.4s,v29.4s,#8
1257	eor	w11,w11,w16
1258	add	v2.4s,v2.4s,v3.4s
1259	eor	w12,w12,w13
1260	add	v6.4s,v6.4s,v7.4s
1261	eor	w9,w9,w14
1262	add	v10.4s,v10.4s,v11.4s
1263	ror	w10,w10,#20
1264	add	v14.4s,v14.4s,v15.4s
1265	ror	w11,w11,#20
1266	add	v18.4s,v18.4s,v19.4s
1267	ror	w12,w12,#20
1268	add	v22.4s,v22.4s,v23.4s
1269	ror	w9,w9,#20
1270	eor	v24.16b,v1.16b,v2.16b
1271	add	w5,w5,w10
1272	eor	v25.16b,v5.16b,v6.16b
1273	add	w6,w6,w11
1274	eor	v26.16b,v9.16b,v10.16b
1275	add	w7,w7,w12
1276	eor	v27.16b,v13.16b,v14.16b
1277	add	w8,w8,w9
1278	eor	v28.16b,v17.16b,v18.16b
1279	eor	w21,w21,w5
1280	eor	v29.16b,v21.16b,v22.16b
1281	eor	w17,w17,w6
1282	ushr	v1.4s,v24.4s,#25
1283	eor	w19,w19,w7
1284	ushr	v5.4s,v25.4s,#25
1285	eor	w20,w20,w8
1286	ushr	v9.4s,v26.4s,#25
1287	ror	w21,w21,#24
1288	ushr	v13.4s,v27.4s,#25
1289	ror	w17,w17,#24
1290	ushr	v17.4s,v28.4s,#25
1291	ror	w19,w19,#24
1292	ushr	v21.4s,v29.4s,#25
1293	ror	w20,w20,#24
1294	sli	v1.4s,v24.4s,#7
1295	add	w15,w15,w21
1296	sli	v5.4s,v25.4s,#7
1297	add	w16,w16,w17
1298	sli	v9.4s,v26.4s,#7
1299	add	w13,w13,w19
1300	sli	v13.4s,v27.4s,#7
1301	add	w14,w14,w20
1302	sli	v17.4s,v28.4s,#7
1303	eor	w10,w10,w15
1304	sli	v21.4s,v29.4s,#7
1305	eor	w11,w11,w16
1306	ext	v2.16b,v2.16b,v2.16b,#8
1307	eor	w12,w12,w13
1308	ext	v6.16b,v6.16b,v6.16b,#8
1309	eor	w9,w9,w14
1310	ext	v10.16b,v10.16b,v10.16b,#8
1311	ror	w10,w10,#25
1312	ext	v14.16b,v14.16b,v14.16b,#8
1313	ror	w11,w11,#25
1314	ext	v18.16b,v18.16b,v18.16b,#8
1315	ror	w12,w12,#25
1316	ext	v22.16b,v22.16b,v22.16b,#8
1317	ror	w9,w9,#25
1318	ext	v3.16b,v3.16b,v3.16b,#4
1319	ext	v7.16b,v7.16b,v7.16b,#4
1320	ext	v11.16b,v11.16b,v11.16b,#4
1321	ext	v15.16b,v15.16b,v15.16b,#4
1322	ext	v19.16b,v19.16b,v19.16b,#4
1323	ext	v23.16b,v23.16b,v23.16b,#4
1324	ext	v1.16b,v1.16b,v1.16b,#12
1325	ext	v5.16b,v5.16b,v5.16b,#12
1326	ext	v9.16b,v9.16b,v9.16b,#12
1327	ext	v13.16b,v13.16b,v13.16b,#12
1328	ext	v17.16b,v17.16b,v17.16b,#12
1329	ext	v21.16b,v21.16b,v21.16b,#12
1330	cbnz	x4,.Loop_upper_neon
1331
// Finish the scalar block that was computed alongside the preceding NEON loop:
// add the saved input words (held packed two-per-register in x22-x28 and x30)
// back into the sixteen working words, pack them pairwise into 64-bit
// registers, XOR with 64 bytes read through x1 and store 64 bytes through x0.
// Then bump the counter word in x28 and re-seed the working words so a second
// scalar block can be computed by the loop that follows.
1332	add	w5,w5,w22		// accumulate key block
1333	add	x6,x6,x22,lsr#32	// odd-numbered words live in the high half of the saved register
1334	add	w7,w7,w23
1335	add	x8,x8,x23,lsr#32
1336	add	w9,w9,w24
1337	add	x10,x10,x24,lsr#32
1338	add	w11,w11,w25
1339	add	x12,x12,x25,lsr#32
1340	add	w13,w13,w26
1341	add	x14,x14,x26,lsr#32
1342	add	w15,w15,w27
1343	add	x16,x16,x27,lsr#32
1344	add	w17,w17,w28
1345	add	x19,x19,x28,lsr#32
1346	add	w20,w20,w30
1347	add	x21,x21,x30,lsr#32
1348
1349	add	x5,x5,x6,lsl#32	// pack
1350	add	x7,x7,x8,lsl#32
1351	ldp	x6,x8,[x1,#0]		// load input
1352	add	x9,x9,x10,lsl#32
1353	add	x11,x11,x12,lsl#32
1354	ldp	x10,x12,[x1,#16]
1355	add	x13,x13,x14,lsl#32
1356	add	x15,x15,x16,lsl#32
1357	ldp	x14,x16,[x1,#32]
1358	add	x17,x17,x19,lsl#32
1359	add	x20,x20,x21,lsl#32
1360	ldp	x19,x21,[x1,#48]
1361	add	x1,x1,#64	// advance the input pointer past the 64 bytes just loaded
1362#ifdef	__ARMEB__
1363	rev	x5,x5
1364	rev	x7,x7
1365	rev	x9,x9
1366	rev	x11,x11
1367	rev	x13,x13
1368	rev	x15,x15
1369	rev	x17,x17
1370	rev	x20,x20
1371#endif
1372	eor	x5,x5,x6	// packed key-stream words ^ input
1373	eor	x7,x7,x8
1374	eor	x9,x9,x10
1375	eor	x11,x11,x12
1376	eor	x13,x13,x14
1377	eor	x15,x15,x16
1378	eor	x17,x17,x19
1379	eor	x20,x20,x21
1380
// The stores of this block are interleaved with re-loading the working words
// w5-w17 / w19-w21 from the saved state for the next scalar block.
1381	stp	x5,x7,[x0,#0]		// store output
1382	add	x28,x28,#1			// increment counter
1383	mov	w5,w22			// unpack key block
1384	lsr	x6,x22,#32
1385	stp	x9,x11,[x0,#16]
1386	mov	w7,w23
1387	lsr	x8,x23,#32
1388	stp	x13,x15,[x0,#32]
1389	mov	w9,w24
1390	lsr	x10,x24,#32
1391	stp	x17,x20,[x0,#48]
1392	add	x0,x0,#64	// advance the output pointer past the 64 bytes just stored
1393	mov	w11,w25
1394	lsr	x12,x25,#32
1395	mov	w13,w26
1396	lsr	x14,x26,#32
1397	mov	w15,w27
1398	lsr	x16,x27,#32
1399	mov	w17,w28	// picks up the counter value incremented above
1400	lsr	x19,x28,#32
1401	mov	w20,w30
1402	lsr	x21,x30,#32
1403
1404	mov	x4,#5	// pass count for the loop that follows
// Inner loop: x4 is decremented at the top and tested by the cbnz at the
// bottom. Two instruction streams are interleaved line by line:
//   * NEON, six 4x32-bit blocks. Row "a" is v0/v4/v8/v12/v16/v20, "b" is
//     v1/v5/v9/v13/v17/v21, "c" is v2/v6/v10/v14/v18/v22 and "d" is
//     v3/v7/v11/v15/v19/v23; v24-v29 are temporaries for the rotates.
//   * Scalar, one block in w5-w17 and w19-w21 (w18 is never touched).
// Both streams apply the same add / xor / rotate-left (16, 12, 8, 7) steps,
// i.e. the ChaCha quarter-round pattern. Each pass performs one pair of rounds
// on the NEON blocks and two pairs of rounds on the scalar block.
1405.Loop_lower_neon:
1406	sub	x4,x4,#1
// --- NEON: first round of the pair.  Scalar: groups (w5,w9,w13,w17),
// --- (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
1407	add	v0.4s,v0.4s,v1.4s	// a += b
1408	add	w5,w5,w9
1409	add	v4.4s,v4.4s,v5.4s
1410	add	w6,w6,w10
1411	add	v8.4s,v8.4s,v9.4s
1412	add	w7,w7,w11
1413	add	v12.4s,v12.4s,v13.4s
1414	add	w8,w8,w12
1415	add	v16.4s,v16.4s,v17.4s
1416	eor	w17,w17,w5
1417	add	v20.4s,v20.4s,v21.4s
1418	eor	w19,w19,w6
1419	eor	v3.16b,v3.16b,v0.16b	// d ^= a
1420	eor	w20,w20,w7
1421	eor	v7.16b,v7.16b,v4.16b
1422	eor	w21,w21,w8
1423	eor	v11.16b,v11.16b,v8.16b
1424	ror	w17,w17,#16
1425	eor	v15.16b,v15.16b,v12.16b
1426	ror	w19,w19,#16
1427	eor	v19.16b,v19.16b,v16.16b
1428	ror	w20,w20,#16
1429	eor	v23.16b,v23.16b,v20.16b
1430	ror	w21,w21,#16
1431	rev32	v3.8h,v3.8h	// swap the 16-bit halves of every 32-bit lane = rotate by 16
1432	add	w13,w13,w17
1433	rev32	v7.8h,v7.8h
1434	add	w14,w14,w19
1435	rev32	v11.8h,v11.8h
1436	add	w15,w15,w20
1437	rev32	v15.8h,v15.8h
1438	add	w16,w16,w21
1439	rev32	v19.8h,v19.8h
1440	eor	w9,w9,w13
1441	rev32	v23.8h,v23.8h
1442	eor	w10,w10,w14
1443	add	v2.4s,v2.4s,v3.4s	// c += d
1444	eor	w11,w11,w15
1445	add	v6.4s,v6.4s,v7.4s
1446	eor	w12,w12,w16
1447	add	v10.4s,v10.4s,v11.4s
1448	ror	w9,w9,#20	// rotate right 20 = rotate left 12
1449	add	v14.4s,v14.4s,v15.4s
1450	ror	w10,w10,#20
1451	add	v18.4s,v18.4s,v19.4s
1452	ror	w11,w11,#20
1453	add	v22.4s,v22.4s,v23.4s
1454	ror	w12,w12,#20
1455	eor	v24.16b,v1.16b,v2.16b	// tmp = b ^ c
1456	add	w5,w5,w9
1457	eor	v25.16b,v5.16b,v6.16b
1458	add	w6,w6,w10
1459	eor	v26.16b,v9.16b,v10.16b
1460	add	w7,w7,w11
1461	eor	v27.16b,v13.16b,v14.16b
1462	add	w8,w8,w12
1463	eor	v28.16b,v17.16b,v18.16b
1464	eor	w17,w17,w5
1465	eor	v29.16b,v21.16b,v22.16b
1466	eor	w19,w19,w6
1467	ushr	v1.4s,v24.4s,#20	// b = tmp >> 20 ...
1468	eor	w20,w20,w7
1469	ushr	v5.4s,v25.4s,#20
1470	eor	w21,w21,w8
1471	ushr	v9.4s,v26.4s,#20
1472	ror	w17,w17,#24	// rotate right 24 = rotate left 8
1473	ushr	v13.4s,v27.4s,#20
1474	ror	w19,w19,#24
1475	ushr	v17.4s,v28.4s,#20
1476	ror	w20,w20,#24
1477	ushr	v21.4s,v29.4s,#20
1478	ror	w21,w21,#24
1479	sli	v1.4s,v24.4s,#12	// ... with tmp << 12 inserted above it: rotate left 12
1480	add	w13,w13,w17
1481	sli	v5.4s,v25.4s,#12
1482	add	w14,w14,w19
1483	sli	v9.4s,v26.4s,#12
1484	add	w15,w15,w20
1485	sli	v13.4s,v27.4s,#12
1486	add	w16,w16,w21
1487	sli	v17.4s,v28.4s,#12
1488	eor	w9,w9,w13
1489	sli	v21.4s,v29.4s,#12
1490	eor	w10,w10,w14
1491	add	v0.4s,v0.4s,v1.4s	// a += b
1492	eor	w11,w11,w15
1493	add	v4.4s,v4.4s,v5.4s
1494	eor	w12,w12,w16
1495	add	v8.4s,v8.4s,v9.4s
1496	ror	w9,w9,#25	// rotate right 25 = rotate left 7
1497	add	v12.4s,v12.4s,v13.4s
1498	ror	w10,w10,#25
1499	add	v16.4s,v16.4s,v17.4s
1500	ror	w11,w11,#25
1501	add	v20.4s,v20.4s,v21.4s
1502	ror	w12,w12,#25
// --- Scalar stream switches to the groups (w5,w10,w15,w21), (w6,w11,w16,w17),
// --- (w7,w12,w13,w19), (w8,w9,w14,w20); NEON stream continues the same round.
1503	eor	v24.16b,v3.16b,v0.16b	// tmp = d ^ a
1504	add	w5,w5,w10
1505	eor	v25.16b,v7.16b,v4.16b
1506	add	w6,w6,w11
1507	eor	v26.16b,v11.16b,v8.16b
1508	add	w7,w7,w12
1509	eor	v27.16b,v15.16b,v12.16b
1510	add	w8,w8,w9
1511	eor	v28.16b,v19.16b,v16.16b
1512	eor	w21,w21,w5
1513	eor	v29.16b,v23.16b,v20.16b
1514	eor	w17,w17,w6
1515	ushr	v3.4s,v24.4s,#24	// d = rotate left 8 of tmp (ushr #24 + sli #8)
1516	eor	w19,w19,w7
1517	ushr	v7.4s,v25.4s,#24
1518	eor	w20,w20,w8
1519	ushr	v11.4s,v26.4s,#24
1520	ror	w21,w21,#16
1521	ushr	v15.4s,v27.4s,#24
1522	ror	w17,w17,#16
1523	ushr	v19.4s,v28.4s,#24
1524	ror	w19,w19,#16
1525	ushr	v23.4s,v29.4s,#24
1526	ror	w20,w20,#16
1527	sli	v3.4s,v24.4s,#8
1528	add	w15,w15,w21
1529	sli	v7.4s,v25.4s,#8
1530	add	w16,w16,w17
1531	sli	v11.4s,v26.4s,#8
1532	add	w13,w13,w19
1533	sli	v15.4s,v27.4s,#8
1534	add	w14,w14,w20
1535	sli	v19.4s,v28.4s,#8
1536	eor	w10,w10,w15
1537	sli	v23.4s,v29.4s,#8
1538	eor	w11,w11,w16
1539	add	v2.4s,v2.4s,v3.4s	// c += d
1540	eor	w12,w12,w13
1541	add	v6.4s,v6.4s,v7.4s
1542	eor	w9,w9,w14
1543	add	v10.4s,v10.4s,v11.4s
1544	ror	w10,w10,#20
1545	add	v14.4s,v14.4s,v15.4s
1546	ror	w11,w11,#20
1547	add	v18.4s,v18.4s,v19.4s
1548	ror	w12,w12,#20
1549	add	v22.4s,v22.4s,v23.4s
1550	ror	w9,w9,#20
1551	eor	v24.16b,v1.16b,v2.16b	// tmp = b ^ c
1552	add	w5,w5,w10
1553	eor	v25.16b,v5.16b,v6.16b
1554	add	w6,w6,w11
1555	eor	v26.16b,v9.16b,v10.16b
1556	add	w7,w7,w12
1557	eor	v27.16b,v13.16b,v14.16b
1558	add	w8,w8,w9
1559	eor	v28.16b,v17.16b,v18.16b
1560	eor	w21,w21,w5
1561	eor	v29.16b,v21.16b,v22.16b
1562	eor	w17,w17,w6
1563	ushr	v1.4s,v24.4s,#25	// b = rotate left 7 of tmp (ushr #25 + sli #7)
1564	eor	w19,w19,w7
1565	ushr	v5.4s,v25.4s,#25
1566	eor	w20,w20,w8
1567	ushr	v9.4s,v26.4s,#25
1568	ror	w21,w21,#24
1569	ushr	v13.4s,v27.4s,#25
1570	ror	w17,w17,#24
1571	ushr	v17.4s,v28.4s,#25
1572	ror	w19,w19,#24
1573	ushr	v21.4s,v29.4s,#25
1574	ror	w20,w20,#24
1575	sli	v1.4s,v24.4s,#7
1576	add	w15,w15,w21
1577	sli	v5.4s,v25.4s,#7
1578	add	w16,w16,w17
1579	sli	v9.4s,v26.4s,#7
1580	add	w13,w13,w19
1581	sli	v13.4s,v27.4s,#7
1582	add	w14,w14,w20
1583	sli	v17.4s,v28.4s,#7
1584	eor	w10,w10,w15
1585	sli	v21.4s,v29.4s,#7
1586	eor	w11,w11,w16
1587	ext	v2.16b,v2.16b,v2.16b,#8	// rotate the lanes of row c by 8 bytes
1588	eor	w12,w12,w13
1589	ext	v6.16b,v6.16b,v6.16b,#8
1590	eor	w9,w9,w14
1591	ext	v10.16b,v10.16b,v10.16b,#8
1592	ror	w10,w10,#25
1593	ext	v14.16b,v14.16b,v14.16b,#8
1594	ror	w11,w11,#25
1595	ext	v18.16b,v18.16b,v18.16b,#8
1596	ror	w12,w12,#25
1597	ext	v22.16b,v22.16b,v22.16b,#8
1598	ror	w9,w9,#25
1599	ext	v3.16b,v3.16b,v3.16b,#12	// rotate the lanes of row d by 12 bytes
1600	ext	v7.16b,v7.16b,v7.16b,#12
1601	ext	v11.16b,v11.16b,v11.16b,#12
1602	ext	v15.16b,v15.16b,v15.16b,#12
1603	ext	v19.16b,v19.16b,v19.16b,#12
1604	ext	v23.16b,v23.16b,v23.16b,#12
1605	ext	v1.16b,v1.16b,v1.16b,#4	// rotate the lanes of row b by 4 bytes
1606	ext	v5.16b,v5.16b,v5.16b,#4
1607	ext	v9.16b,v9.16b,v9.16b,#4
1608	ext	v13.16b,v13.16b,v13.16b,#4
1609	ext	v17.16b,v17.16b,v17.16b,#4
1610	ext	v21.16b,v21.16b,v21.16b,#4
// --- NEON: second round of the pair, same steps applied to the lane-rotated
// --- rows.  Scalar: a second pair of rounds, starting again with the groups
// --- (w5,w9,w13,w17), (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
1611	add	v0.4s,v0.4s,v1.4s
1612	add	w5,w5,w9
1613	add	v4.4s,v4.4s,v5.4s
1614	add	w6,w6,w10
1615	add	v8.4s,v8.4s,v9.4s
1616	add	w7,w7,w11
1617	add	v12.4s,v12.4s,v13.4s
1618	add	w8,w8,w12
1619	add	v16.4s,v16.4s,v17.4s
1620	eor	w17,w17,w5
1621	add	v20.4s,v20.4s,v21.4s
1622	eor	w19,w19,w6
1623	eor	v3.16b,v3.16b,v0.16b
1624	eor	w20,w20,w7
1625	eor	v7.16b,v7.16b,v4.16b
1626	eor	w21,w21,w8
1627	eor	v11.16b,v11.16b,v8.16b
1628	ror	w17,w17,#16
1629	eor	v15.16b,v15.16b,v12.16b
1630	ror	w19,w19,#16
1631	eor	v19.16b,v19.16b,v16.16b
1632	ror	w20,w20,#16
1633	eor	v23.16b,v23.16b,v20.16b
1634	ror	w21,w21,#16
1635	rev32	v3.8h,v3.8h
1636	add	w13,w13,w17
1637	rev32	v7.8h,v7.8h
1638	add	w14,w14,w19
1639	rev32	v11.8h,v11.8h
1640	add	w15,w15,w20
1641	rev32	v15.8h,v15.8h
1642	add	w16,w16,w21
1643	rev32	v19.8h,v19.8h
1644	eor	w9,w9,w13
1645	rev32	v23.8h,v23.8h
1646	eor	w10,w10,w14
1647	add	v2.4s,v2.4s,v3.4s
1648	eor	w11,w11,w15
1649	add	v6.4s,v6.4s,v7.4s
1650	eor	w12,w12,w16
1651	add	v10.4s,v10.4s,v11.4s
1652	ror	w9,w9,#20
1653	add	v14.4s,v14.4s,v15.4s
1654	ror	w10,w10,#20
1655	add	v18.4s,v18.4s,v19.4s
1656	ror	w11,w11,#20
1657	add	v22.4s,v22.4s,v23.4s
1658	ror	w12,w12,#20
1659	eor	v24.16b,v1.16b,v2.16b
1660	add	w5,w5,w9
1661	eor	v25.16b,v5.16b,v6.16b
1662	add	w6,w6,w10
1663	eor	v26.16b,v9.16b,v10.16b
1664	add	w7,w7,w11
1665	eor	v27.16b,v13.16b,v14.16b
1666	add	w8,w8,w12
1667	eor	v28.16b,v17.16b,v18.16b
1668	eor	w17,w17,w5
1669	eor	v29.16b,v21.16b,v22.16b
1670	eor	w19,w19,w6
1671	ushr	v1.4s,v24.4s,#20
1672	eor	w20,w20,w7
1673	ushr	v5.4s,v25.4s,#20
1674	eor	w21,w21,w8
1675	ushr	v9.4s,v26.4s,#20
1676	ror	w17,w17,#24
1677	ushr	v13.4s,v27.4s,#20
1678	ror	w19,w19,#24
1679	ushr	v17.4s,v28.4s,#20
1680	ror	w20,w20,#24
1681	ushr	v21.4s,v29.4s,#20
1682	ror	w21,w21,#24
1683	sli	v1.4s,v24.4s,#12
1684	add	w13,w13,w17
1685	sli	v5.4s,v25.4s,#12
1686	add	w14,w14,w19
1687	sli	v9.4s,v26.4s,#12
1688	add	w15,w15,w20
1689	sli	v13.4s,v27.4s,#12
1690	add	w16,w16,w21
1691	sli	v17.4s,v28.4s,#12
1692	eor	w9,w9,w13
1693	sli	v21.4s,v29.4s,#12
1694	eor	w10,w10,w14
1695	add	v0.4s,v0.4s,v1.4s
1696	eor	w11,w11,w15
1697	add	v4.4s,v4.4s,v5.4s
1698	eor	w12,w12,w16
1699	add	v8.4s,v8.4s,v9.4s
1700	ror	w9,w9,#25
1701	add	v12.4s,v12.4s,v13.4s
1702	ror	w10,w10,#25
1703	add	v16.4s,v16.4s,v17.4s
1704	ror	w11,w11,#25
1705	add	v20.4s,v20.4s,v21.4s
1706	ror	w12,w12,#25
// --- Scalar stream: groups (w5,w10,w15,w21), (w6,w11,w16,w17),
// --- (w7,w12,w13,w19), (w8,w9,w14,w20) again.
1707	eor	v24.16b,v3.16b,v0.16b
1708	add	w5,w5,w10
1709	eor	v25.16b,v7.16b,v4.16b
1710	add	w6,w6,w11
1711	eor	v26.16b,v11.16b,v8.16b
1712	add	w7,w7,w12
1713	eor	v27.16b,v15.16b,v12.16b
1714	add	w8,w8,w9
1715	eor	v28.16b,v19.16b,v16.16b
1716	eor	w21,w21,w5
1717	eor	v29.16b,v23.16b,v20.16b
1718	eor	w17,w17,w6
1719	ushr	v3.4s,v24.4s,#24
1720	eor	w19,w19,w7
1721	ushr	v7.4s,v25.4s,#24
1722	eor	w20,w20,w8
1723	ushr	v11.4s,v26.4s,#24
1724	ror	w21,w21,#16
1725	ushr	v15.4s,v27.4s,#24
1726	ror	w17,w17,#16
1727	ushr	v19.4s,v28.4s,#24
1728	ror	w19,w19,#16
1729	ushr	v23.4s,v29.4s,#24
1730	ror	w20,w20,#16
1731	sli	v3.4s,v24.4s,#8
1732	add	w15,w15,w21
1733	sli	v7.4s,v25.4s,#8
1734	add	w16,w16,w17
1735	sli	v11.4s,v26.4s,#8
1736	add	w13,w13,w19
1737	sli	v15.4s,v27.4s,#8
1738	add	w14,w14,w20
1739	sli	v19.4s,v28.4s,#8
1740	eor	w10,w10,w15
1741	sli	v23.4s,v29.4s,#8
1742	eor	w11,w11,w16
1743	add	v2.4s,v2.4s,v3.4s
1744	eor	w12,w12,w13
1745	add	v6.4s,v6.4s,v7.4s
1746	eor	w9,w9,w14
1747	add	v10.4s,v10.4s,v11.4s
1748	ror	w10,w10,#20
1749	add	v14.4s,v14.4s,v15.4s
1750	ror	w11,w11,#20
1751	add	v18.4s,v18.4s,v19.4s
1752	ror	w12,w12,#20
1753	add	v22.4s,v22.4s,v23.4s
1754	ror	w9,w9,#20
1755	eor	v24.16b,v1.16b,v2.16b
1756	add	w5,w5,w10
1757	eor	v25.16b,v5.16b,v6.16b
1758	add	w6,w6,w11
1759	eor	v26.16b,v9.16b,v10.16b
1760	add	w7,w7,w12
1761	eor	v27.16b,v13.16b,v14.16b
1762	add	w8,w8,w9
1763	eor	v28.16b,v17.16b,v18.16b
1764	eor	w21,w21,w5
1765	eor	v29.16b,v21.16b,v22.16b
1766	eor	w17,w17,w6
1767	ushr	v1.4s,v24.4s,#25
1768	eor	w19,w19,w7
1769	ushr	v5.4s,v25.4s,#25
1770	eor	w20,w20,w8
1771	ushr	v9.4s,v26.4s,#25
1772	ror	w21,w21,#24
1773	ushr	v13.4s,v27.4s,#25
1774	ror	w17,w17,#24
1775	ushr	v17.4s,v28.4s,#25
1776	ror	w19,w19,#24
1777	ushr	v21.4s,v29.4s,#25
1778	ror	w20,w20,#24
1779	sli	v1.4s,v24.4s,#7
1780	add	w15,w15,w21
1781	sli	v5.4s,v25.4s,#7
1782	add	w16,w16,w17
1783	sli	v9.4s,v26.4s,#7
1784	add	w13,w13,w19
1785	sli	v13.4s,v27.4s,#7
1786	add	w14,w14,w20
1787	sli	v17.4s,v28.4s,#7
1788	eor	w10,w10,w15
1789	sli	v21.4s,v29.4s,#7
1790	eor	w11,w11,w16
1791	ext	v2.16b,v2.16b,v2.16b,#8
1792	eor	w12,w12,w13
1793	ext	v6.16b,v6.16b,v6.16b,#8
1794	eor	w9,w9,w14
1795	ext	v10.16b,v10.16b,v10.16b,#8
1796	ror	w10,w10,#25
1797	ext	v14.16b,v14.16b,v14.16b,#8
1798	ror	w11,w11,#25
1799	ext	v18.16b,v18.16b,v18.16b,#8
1800	ror	w12,w12,#25
1801	ext	v22.16b,v22.16b,v22.16b,#8
1802	ror	w9,w9,#25
1803	ext	v3.16b,v3.16b,v3.16b,#4	// #4 here undoes the #12 lane rotation of row d above
1804	ext	v7.16b,v7.16b,v7.16b,#4
1805	ext	v11.16b,v11.16b,v11.16b,#4
1806	ext	v15.16b,v15.16b,v15.16b,#4
1807	ext	v19.16b,v19.16b,v19.16b,#4
1808	ext	v23.16b,v23.16b,v23.16b,#4
1809	ext	v1.16b,v1.16b,v1.16b,#12	// #12 here undoes the #4 lane rotation of row b above
1810	ext	v5.16b,v5.16b,v5.16b,#12
1811	ext	v9.16b,v9.16b,v9.16b,#12
1812	ext	v13.16b,v13.16b,v13.16b,#12
1813	ext	v17.16b,v17.16b,v17.16b,#12
1814	ext	v21.16b,v21.16b,v21.16b,#12
1815	cbnz	x4,.Loop_lower_neon	// repeat until the pass counter reaches zero
1816
// Finish seven 64-byte blocks at once: the scalar block in w5-w21 and the six
// NEON blocks in v0-v23. Each gets its saved input state added back, is XORed
// with input read through x1 and is written through x0 (64 scalar bytes, then
// six 64-byte NEON groups). Scalar and NEON instructions are interleaved.
//
// q24-q29 are reloaded from the 96-byte scratch area at sp. v24/v25/v26 are
// added to every a/b/c row respectively; the d rows (v3,v7,v11,v15,v19,v23)
// get v27,v28,v29,v30,v27+v31,v28+v31.
// NOTE(review): presumably v24-v26 are the constant/key rows and v27-v30 are
// per-block counter rows saved by code above this section - confirm there.
1817	add	w5,w5,w22		// accumulate key block
1818	ldp	q24,q25,[sp,#0]
1819	add	x6,x6,x22,lsr#32
1820	ldp	q26,q27,[sp,#32]
1821	add	w7,w7,w23
1822	ldp	q28,q29,[sp,#64]
1823	add	x8,x8,x23,lsr#32
1824	add	v0.4s,v0.4s,v24.4s	// row a of each NEON block += v24
1825	add	w9,w9,w24
1826	add	v4.4s,v4.4s,v24.4s
1827	add	x10,x10,x24,lsr#32
1828	add	v8.4s,v8.4s,v24.4s
1829	add	w11,w11,w25
1830	add	v12.4s,v12.4s,v24.4s
1831	add	x12,x12,x25,lsr#32
1832	add	v16.4s,v16.4s,v24.4s
1833	add	w13,w13,w26
1834	add	v20.4s,v20.4s,v24.4s
1835	add	x14,x14,x26,lsr#32
1836	add	v2.4s,v2.4s,v26.4s	// row c of each NEON block += v26
1837	add	w15,w15,w27
1838	add	v6.4s,v6.4s,v26.4s
1839	add	x16,x16,x27,lsr#32
1840	add	v10.4s,v10.4s,v26.4s
1841	add	w17,w17,w28
1842	add	v14.4s,v14.4s,v26.4s
1843	add	x19,x19,x28,lsr#32
1844	add	v18.4s,v18.4s,v26.4s
1845	add	w20,w20,w30
1846	add	v22.4s,v22.4s,v26.4s
1847	add	x21,x21,x30,lsr#32
1848	add	v19.4s,v19.4s,v31.4s			// +4
1849	add	x5,x5,x6,lsl#32	// pack
1850	add	v23.4s,v23.4s,v31.4s			// +4
1851	add	x7,x7,x8,lsl#32
1852	add	v3.4s,v3.4s,v27.4s	// row d of each NEON block += its own vector
1853	ldp	x6,x8,[x1,#0]		// load input
1854	add	v7.4s,v7.4s,v28.4s
1855	add	x9,x9,x10,lsl#32
1856	add	v11.4s,v11.4s,v29.4s
1857	add	x11,x11,x12,lsl#32
1858	add	v15.4s,v15.4s,v30.4s
1859	ldp	x10,x12,[x1,#16]
1860	add	v19.4s,v19.4s,v27.4s
1861	add	x13,x13,x14,lsl#32
1862	add	v23.4s,v23.4s,v28.4s
1863	add	x15,x15,x16,lsl#32
1864	add	v1.4s,v1.4s,v25.4s	// row b of each NEON block += v25
1865	ldp	x14,x16,[x1,#32]
1866	add	v5.4s,v5.4s,v25.4s
1867	add	x17,x17,x19,lsl#32
1868	add	v9.4s,v9.4s,v25.4s
1869	add	x20,x20,x21,lsl#32
1870	add	v13.4s,v13.4s,v25.4s
1871	ldp	x19,x21,[x1,#48]
1872	add	v17.4s,v17.4s,v25.4s
1873	add	x1,x1,#64
1874	add	v21.4s,v21.4s,v25.4s
1875
1876#ifdef	__ARMEB__
1877	rev	x5,x5
1878	rev	x7,x7
1879	rev	x9,x9
1880	rev	x11,x11
1881	rev	x13,x13
1882	rev	x15,x15
1883	rev	x17,x17
1884	rev	x20,x20
1885#endif
// From here v24-v27 (and later v0-v15) are reused as input buffers: each ld1
// fetches the next 64 input bytes and post-increments x1, each st1 writes 64
// output bytes and post-increments x0.
1886	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1887	eor	x5,x5,x6
1888	eor	x7,x7,x8
1889	eor	x9,x9,x10
1890	eor	x11,x11,x12
1891	eor	x13,x13,x14
1892	eor	v0.16b,v0.16b,v24.16b
1893	eor	x15,x15,x16
1894	eor	v1.16b,v1.16b,v25.16b
1895	eor	x17,x17,x19
1896	eor	v2.16b,v2.16b,v26.16b
1897	eor	x20,x20,x21
1898	eor	v3.16b,v3.16b,v27.16b
1899	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1900
1901	stp	x5,x7,[x0,#0]		// store output
1902	add	x28,x28,#7			// increment counter
1903	stp	x9,x11,[x0,#16]
1904	stp	x13,x15,[x0,#32]
1905	stp	x17,x20,[x0,#48]
1906	add	x0,x0,#64
1907	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1908
1909	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1910	eor	v4.16b,v4.16b,v24.16b
1911	eor	v5.16b,v5.16b,v25.16b
1912	eor	v6.16b,v6.16b,v26.16b
1913	eor	v7.16b,v7.16b,v27.16b
1914	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1915
1916	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1917	eor	v8.16b,v8.16b,v0.16b
1918	ldp	q24,q25,[sp,#0]	// v24-v27 held input data above; reload them from the scratch area
1919	eor	v9.16b,v9.16b,v1.16b
1920	ldp	q26,q27,[sp,#32]
1921	eor	v10.16b,v10.16b,v2.16b
1922	eor	v11.16b,v11.16b,v3.16b
1923	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1924
1925	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1926	eor	v12.16b,v12.16b,v4.16b
1927	eor	v13.16b,v13.16b,v5.16b
1928	eor	v14.16b,v14.16b,v6.16b
1929	eor	v15.16b,v15.16b,v7.16b
1930	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1931
1932	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1933	eor	v16.16b,v16.16b,v8.16b
1934	eor	v17.16b,v17.16b,v9.16b
1935	eor	v18.16b,v18.16b,v10.16b
1936	eor	v19.16b,v19.16b,v11.16b
1937	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1938
1939	shl	v0.4s,v31.4s,#1			// 4 -> 8
1940	eor	v20.16b,v20.16b,v12.16b
1941	eor	v21.16b,v21.16b,v13.16b
1942	eor	v22.16b,v22.16b,v14.16b
1943	eor	v23.16b,v23.16b,v15.16b
1944	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1945
1946	add	v27.4s,v27.4s,v0.4s			// += 8
1947	add	v28.4s,v28.4s,v0.4s
1948	add	v29.4s,v29.4s,v0.4s
1949	add	v30.4s,v30.4s,v0.4s
1950
// Nothing in this section writes the condition flags, so this branch tests
// flags produced earlier in the outer loop.
// NOTE(review): presumably a `subs` on the remaining length x2 - confirm.
1951	b.hs	.Loop_outer_512_neon
1952
// Reached when the 512-byte outer loop is not taken again. Adds 512 back to
// the remaining length in x2, restores d8-d15, overwrites the scratch area,
// and then either finishes (x2 == 0) or hands the remainder to one of the
// other loops: .Loop_outer_neon when at least 192 bytes remain, otherwise
// .Loop_outer.
1953	adds	x2,x2,#512	// sets the flags tested by the b.eq below; nothing in between alters them
1954	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1955
1956	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1957	ldp	d10,d11,[sp,#128+16]
1958	ldp	d12,d13,[sp,#128+32]
1959	ldp	d14,d15,[sp,#128+48]
1960
// Overwrites the 96 bytes at sp+0..sp+95 with the current contents of q24 and
// q31 (not with zeros), replacing what was kept there.
1961	stp	q24,q31,[sp,#0]		// wipe off-load area
1962	stp	q24,q31,[sp,#32]
1963	stp	q24,q31,[sp,#64]
1964
1965	b.eq	.Ldone_512_neon	// x2 became zero: all input consumed
1966
1967	cmp	x2,#192	// flags for the b.hs below
1968	sub	v27.4s,v27.4s,v0.4s			// -= 1
1969	sub	v28.4s,v28.4s,v0.4s
1970	sub	v29.4s,v29.4s,v0.4s
1971	add	sp,sp,#128	// release 128 bytes; .Ldone_512_neon releases 128+64 instead
1972	b.hs	.Loop_outer_neon	// >= 192 bytes left (unsigned compare)
1973
// Fewer than 192 bytes left: clear v25-v30 and continue in .Loop_outer.
1974	eor	v25.16b,v25.16b,v25.16b
1975	eor	v26.16b,v26.16b,v26.16b
1976	eor	v27.16b,v27.16b,v27.16b
1977	eor	v28.16b,v28.16b,v28.16b
1978	eor	v29.16b,v29.16b,v29.16b
1979	eor	v30.16b,v30.16b,v30.16b
1980	b	.Loop_outer
1981
// Common exit of ChaCha20_512_neon: reload x19-x28 from the frame addressed by
// x29, release the 128+64 bytes of local stack, pop x29/x30 together with the
// 96-byte frame, and return.
// NOTE(review): the matching prologue is above this section; the offsets here
// mirror the `stp x29,x30,[sp,#-96]!` frame layout used by ChaCha20_ctr32 -
// confirm that this function's prologue does the same.
1982.Ldone_512_neon:
1983	ldp	x19,x20,[x29,#16]
1984	add	sp,sp,#128+64	// sp now points at the saved x29/x30 pair
1985	ldp	x21,x22,[x29,#32]
1986	ldp	x23,x24,[x29,#48]
1987	ldp	x25,x26,[x29,#64]
1988	ldp	x27,x28,[x29,#80]
1989	ldp	x29,x30,[sp],#96	// restore frame pointer and link register, pop the frame
1990	AARCH64_VALIDATE_LINK_REGISTER
1991	ret
1992.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1993#endif
1994#endif  // !OPENSSL_NO_ASM
1995.section	.note.GNU-stack,"",%progbits
1996