xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/crypto/chacha-armv8-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <openssl/arm_arch.h>
8
// Read-only constants used by the ChaCha20 routines in this file.
9.section	.rodata
10
11.align	5
// Lsigma: 16 bytes that read as the ASCII text "expand 32-byte k" when the
// two quads are stored little-endian. The routines below load them as the
// first four words of the ChaCha20 state.
12Lsigma:
13.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Lone: the 4x32-bit vector {1,0,0,0}. It sits 16 bytes after Lsigma, which
// the NEON code relies on (post-incremented load of Lsigma, then load of
// Lone), and is used there as the block-counter increment.
14Lone:
15.long	1,0,0,0
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
16.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
17.align	2
18
19.text
20
// ChaCha20_ctr32_nohw: ChaCha20 keystream XOR using only general-purpose
// registers, one 64-byte block per outer iteration.
//
// Register roles on entry, as shown by the loads and stores below:
//   x0 = output pointer          x1 = input pointer
//   x2 = byte count              x3 = pointer to 32 bytes of key
//   x4 = pointer to the 16-byte counter block
// NOTE(review): the C prototype is not visible in this file; confirm the
// argument meaning against the caller.
// NOTE(review): with x2 == 0 the byte-wise tail loop reaches its cbnz exit
// test for the first time with x2 == 1, so it would not stop after zero
// bytes. Callers presumably never pass a zero length; confirm.
//
// Working state (sixteen 32-bit words, one per register):
//   row 0: w5  w6  w7  w8       row 1: w9  w10 w11 w12
//   row 2: w13 w14 w15 w16      row 3: w17 w19 w20 w21   (x18 is not used)
// x22-x28 and x30 keep the packed input block, two words per register:
//   x22,x23 = sigma   x24-x27 = key   x28,x30 = counter block
21.globl	ChaCha20_ctr32_nohw
22
23.def ChaCha20_ctr32_nohw
24   .type 32
25.endef
26.align	5
27ChaCha20_ctr32_nohw:
28	AARCH64_SIGN_LINK_REGISTER
// 96-byte frame: x29/x30 at the bottom, x19-x28 saved above them.
29	stp	x29,x30,[sp,#-96]!
30	add	x29,sp,#0
31
32	adrp	x5,Lsigma
33	add	x5,x5,:lo12:Lsigma
34	stp	x19,x20,[sp,#16]
35	stp	x21,x22,[sp,#32]
36	stp	x23,x24,[sp,#48]
37	stp	x25,x26,[sp,#64]
38	stp	x27,x28,[sp,#80]
// 64 bytes of scratch below the frame; only the tail path writes to it
// (one keystream block for the byte-wise XOR).
39	sub	sp,sp,#64
40
41	ldp	x22,x23,[x5]		// load sigma
42	ldp	x24,x25,[x3]		// load key
43	ldp	x26,x27,[x3,#16]
// x30 (link register) was saved in the frame above, so it is reused here as
// a data register until the epilogue reloads it.
44	ldp	x28,x30,[x4]		// load counter
45#ifdef	__AARCH64EB__
46	ror	x24,x24,#32
47	ror	x25,x25,#32
48	ror	x26,x26,#32
49	ror	x27,x27,#32
50	ror	x28,x28,#32
51	ror	x30,x30,#32
52#endif
53
// One pass of Loop_outer produces one 64-byte keystream block.
54Loop_outer:
55	mov	w5,w22			// unpack key block
56	lsr	x6,x22,#32
57	mov	w7,w23
58	lsr	x8,x23,#32
59	mov	w9,w24
60	lsr	x10,x24,#32
61	mov	w11,w25
62	lsr	x12,x25,#32
63	mov	w13,w26
64	lsr	x14,x26,#32
65	mov	w15,w27
66	lsr	x16,x27,#32
67	mov	w17,w28
68	lsr	x19,x28,#32
69	mov	w20,w30
70	lsr	x21,x30,#32
71
// x4 counts 10 passes of Loop; each pass is two rounds (columns, then
// diagonals), giving 20 rounds per block.
72	mov	x4,#10
// The flags set here are consumed by b.lo and b.hi further down; nothing in
// between (sub/add/eor/ror/cbnz/ldp/stp/rev) writes the condition flags.
73	subs	x2,x2,#64
74Loop:
75	sub	x4,x4,#1
// ---- column round: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
// (w7,w11,w15,w20), (w8,w12,w16,w21), four at a time ----
// a += b; d ^= a; d = rotate-left(d,16)   (ror #16)
76	add	w5,w5,w9
77	add	w6,w6,w10
78	add	w7,w7,w11
79	add	w8,w8,w12
80	eor	w17,w17,w5
81	eor	w19,w19,w6
82	eor	w20,w20,w7
83	eor	w21,w21,w8
84	ror	w17,w17,#16
85	ror	w19,w19,#16
86	ror	w20,w20,#16
87	ror	w21,w21,#16
// c += d; b ^= c; b = rotate-left(b,12)   (ror #20)
88	add	w13,w13,w17
89	add	w14,w14,w19
90	add	w15,w15,w20
91	add	w16,w16,w21
92	eor	w9,w9,w13
93	eor	w10,w10,w14
94	eor	w11,w11,w15
95	eor	w12,w12,w16
96	ror	w9,w9,#20
97	ror	w10,w10,#20
98	ror	w11,w11,#20
99	ror	w12,w12,#20
// a += b; d ^= a; d = rotate-left(d,8)    (ror #24)
100	add	w5,w5,w9
101	add	w6,w6,w10
102	add	w7,w7,w11
103	add	w8,w8,w12
104	eor	w17,w17,w5
105	eor	w19,w19,w6
106	eor	w20,w20,w7
107	eor	w21,w21,w8
108	ror	w17,w17,#24
109	ror	w19,w19,#24
110	ror	w20,w20,#24
111	ror	w21,w21,#24
// c += d; b ^= c; b = rotate-left(b,7)    (ror #25)
112	add	w13,w13,w17
113	add	w14,w14,w19
114	add	w15,w15,w20
115	add	w16,w16,w21
116	eor	w9,w9,w13
117	eor	w10,w10,w14
118	eor	w11,w11,w15
119	eor	w12,w12,w16
120	ror	w9,w9,#25
121	ror	w10,w10,#25
122	ror	w11,w11,#25
123	ror	w12,w12,#25
// ---- diagonal round: same steps on (w5,w10,w15,w21), (w6,w11,w16,w17),
// (w7,w12,w13,w19), (w8,w9,w14,w20) ----
124	add	w5,w5,w10
125	add	w6,w6,w11
126	add	w7,w7,w12
127	add	w8,w8,w9
128	eor	w21,w21,w5
129	eor	w17,w17,w6
130	eor	w19,w19,w7
131	eor	w20,w20,w8
132	ror	w21,w21,#16
133	ror	w17,w17,#16
134	ror	w19,w19,#16
135	ror	w20,w20,#16
136	add	w15,w15,w21
137	add	w16,w16,w17
138	add	w13,w13,w19
139	add	w14,w14,w20
140	eor	w10,w10,w15
141	eor	w11,w11,w16
142	eor	w12,w12,w13
143	eor	w9,w9,w14
144	ror	w10,w10,#20
145	ror	w11,w11,#20
146	ror	w12,w12,#20
147	ror	w9,w9,#20
148	add	w5,w5,w10
149	add	w6,w6,w11
150	add	w7,w7,w12
151	add	w8,w8,w9
152	eor	w21,w21,w5
153	eor	w17,w17,w6
154	eor	w19,w19,w7
155	eor	w20,w20,w8
156	ror	w21,w21,#24
157	ror	w17,w17,#24
158	ror	w19,w19,#24
159	ror	w20,w20,#24
160	add	w15,w15,w21
161	add	w16,w16,w17
162	add	w13,w13,w19
163	add	w14,w14,w20
164	eor	w10,w10,w15
165	eor	w11,w11,w16
166	eor	w12,w12,w13
167	eor	w9,w9,w14
168	ror	w10,w10,#25
169	ror	w11,w11,#25
170	ror	w12,w12,#25
171	ror	w9,w9,#25
172	cbnz	x4,Loop
173
// Add the original input block back into the permuted state. Even words use
// 32-bit adds; odd words use 64-bit adds of the upper half (lsr#32), and any
// carry above bit 31 is discarded when they are shifted left by 32 in the
// pack step.
174	add	w5,w5,w22		// accumulate key block
175	add	x6,x6,x22,lsr#32
176	add	w7,w7,w23
177	add	x8,x8,x23,lsr#32
178	add	w9,w9,w24
179	add	x10,x10,x24,lsr#32
180	add	w11,w11,w25
181	add	x12,x12,x25,lsr#32
182	add	w13,w13,w26
183	add	x14,x14,x26,lsr#32
184	add	w15,w15,w27
185	add	x16,x16,x27,lsr#32
186	add	w17,w17,w28
187	add	x19,x19,x28,lsr#32
188	add	w20,w20,w30
189	add	x21,x21,x30,lsr#32
190
// Flags still come from the subs at the top of Loop_outer: "lo" means fewer
// than 64 bytes were left, so the block is finished byte-wise in Ltail.
191	b.lo	Ltail
192
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been consumed.
193	add	x5,x5,x6,lsl#32	// pack
194	add	x7,x7,x8,lsl#32
195	ldp	x6,x8,[x1,#0]		// load input
196	add	x9,x9,x10,lsl#32
197	add	x11,x11,x12,lsl#32
198	ldp	x10,x12,[x1,#16]
199	add	x13,x13,x14,lsl#32
200	add	x15,x15,x16,lsl#32
201	ldp	x14,x16,[x1,#32]
202	add	x17,x17,x19,lsl#32
203	add	x20,x20,x21,lsl#32
204	ldp	x19,x21,[x1,#48]
205	add	x1,x1,#64
206#ifdef	__AARCH64EB__
207	rev	x5,x5
208	rev	x7,x7
209	rev	x9,x9
210	rev	x11,x11
211	rev	x13,x13
212	rev	x15,x15
213	rev	x17,x17
214	rev	x20,x20
215#endif
// keystream XOR input
216	eor	x5,x5,x6
217	eor	x7,x7,x8
218	eor	x9,x9,x10
219	eor	x11,x11,x12
220	eor	x13,x13,x14
221	eor	x15,x15,x16
222	eor	x17,x17,x19
223	eor	x20,x20,x21
224
225	stp	x5,x7,[x0,#0]		// store output
// NOTE(review): this is a 64-bit add on the register that packs the first
// two counter-block words, so a wrap of the low 32-bit word would carry into
// the next word. Presumably the caller bounds the length so this cannot
// happen; confirm.
226	add	x28,x28,#1			// increment counter
227	stp	x9,x11,[x0,#16]
228	stp	x13,x15,[x0,#32]
229	stp	x17,x20,[x0,#48]
230	add	x0,x0,#64
231
// "hi" (from the same subs): more than 64 bytes were left before this block,
// so at least one byte remains.
232	b.hi	Loop_outer
233
// Exact multiple of 64 bytes: restore callee-saved registers and return.
// Saved registers are addressed through x29, so sp can be released early.
234	ldp	x19,x20,[x29,#16]
235	add	sp,sp,#64
236	ldp	x21,x22,[x29,#32]
237	ldp	x23,x24,[x29,#48]
238	ldp	x25,x26,[x29,#64]
239	ldp	x27,x28,[x29,#80]
240	ldp	x29,x30,[sp],#96
241	AARCH64_VALIDATE_LINK_REGISTER
242	ret
243
244.align	4
// Tail: fewer than 64 bytes remain. Undo the subs so x2 is the remaining
// byte count again.
245Ltail:
246	add	x2,x2,#64
// Less_than_64 is also a branch target of Ltail_neon in ChaCha20_ctr32_neon,
// which arrives with the same scalar state layout, x2 = remaining bytes and
// the same frame/scratch layout.
247Less_than_64:
// Point x1/x0/x4 just past the end of input/output/scratch and run x2 from
// -count up to 0. x0 is biased by -1 because the store in Loop_tail indexes
// with x2 after it has been incremented.
248	sub	x0,x0,#1
249	add	x1,x1,x2
250	add	x0,x0,x2
251	add	x4,sp,x2
252	neg	x2,x2
253
254	add	x5,x5,x6,lsl#32	// pack
255	add	x7,x7,x8,lsl#32
256	add	x9,x9,x10,lsl#32
257	add	x11,x11,x12,lsl#32
258	add	x13,x13,x14,lsl#32
259	add	x15,x15,x16,lsl#32
260	add	x17,x17,x19,lsl#32
261	add	x20,x20,x21,lsl#32
262#ifdef	__AARCH64EB__
263	rev	x5,x5
264	rev	x7,x7
265	rev	x9,x9
266	rev	x11,x11
267	rev	x13,x13
268	rev	x15,x15
269	rev	x17,x17
270	rev	x20,x20
271#endif
// Spill the whole 64-byte keystream block to the stack scratch area.
272	stp	x5,x7,[sp,#0]
273	stp	x9,x11,[sp,#16]
274	stp	x13,x15,[sp,#32]
275	stp	x17,x20,[sp,#48]
276
// out[i] = in[i] ^ keystream[i], one byte per iteration, until x2 reaches 0.
277Loop_tail:
278	ldrb	w10,[x1,x2]
279	ldrb	w11,[x4,x2]
280	add	x2,x2,#1
281	eor	w10,w10,w11
282	strb	w10,[x0,x2]
283	cbnz	x2,Loop_tail
284
// Overwrite the keystream copy on the stack with zeros before returning.
285	stp	xzr,xzr,[sp,#0]
286	stp	xzr,xzr,[sp,#16]
287	stp	xzr,xzr,[sp,#32]
288	stp	xzr,xzr,[sp,#48]
289
290	ldp	x19,x20,[x29,#16]
291	add	sp,sp,#64
292	ldp	x21,x22,[x29,#32]
293	ldp	x23,x24,[x29,#48]
294	ldp	x25,x26,[x29,#64]
295	ldp	x27,x28,[x29,#80]
296	ldp	x29,x30,[sp],#96
297	AARCH64_VALIDATE_LINK_REGISTER
298	ret
299
300
// ChaCha20_ctr32_neon: ChaCha20 keystream XOR that produces four 64-byte
// blocks (256 bytes) per outer iteration: one block in general-purpose
// registers and three in NEON registers, with the two instruction streams
// interleaved.
//
// Register roles on entry, as shown by the loads and stores below:
//   x0 = output pointer          x1 = input pointer
//   x2 = byte count              x3 = pointer to 32 bytes of key
//   x4 = pointer to the 16-byte counter block
// NOTE(review): the C prototype is not visible in this file; confirm the
// argument meaning against the caller.
// NOTE(review): a zero byte count falls through Ltail_neon into Less_than_64
// (in ChaCha20_ctr32_nohw), whose byte loop does not stop after zero bytes.
// Callers presumably never pass zero; confirm.
//
// Scalar block: state words in w5-w17,w19-w21, input block packed in
// x22-x28,x30 (same layout as ChaCha20_ctr32_nohw).
// NEON blocks, one 4-word row per vector:
//   block A: v0  v1  v2  v3      block B: v4  v5  v6  v7
//   block C: v16 v17 v18 v19     temporaries: v20-v23
//   v24 = sigma, v25/v26 = key, v27/v28/v29 = counter rows for A/B/C,
//   v31 = counter increment
301.globl	ChaCha20_ctr32_neon
302
303.def ChaCha20_ctr32_neon
304   .type 32
305.endef
306.align	5
307ChaCha20_ctr32_neon:
308	AARCH64_SIGN_LINK_REGISTER
// 96-byte frame: x29/x30 at the bottom, x19-x28 saved above them.
309	stp	x29,x30,[sp,#-96]!
310	add	x29,sp,#0
311
312	adrp	x5,Lsigma
313	add	x5,x5,:lo12:Lsigma
314	stp	x19,x20,[sp,#16]
315	stp	x21,x22,[sp,#32]
316	stp	x23,x24,[sp,#48]
317	stp	x25,x26,[sp,#64]
318	stp	x27,x28,[sp,#80]
// 512 bytes or more: continue at L512_or_more_neon, a label inside
// ChaCha20_512_neon placed right after an identical frame setup.
319	cmp	x2,#512
320	b.hs	L512_or_more_neon
321
// 64 bytes of scratch, used only to hold one keystream block for a partial
// final block.
322	sub	sp,sp,#64
323
324	ldp	x22,x23,[x5]		// load sigma
// The post-increment leaves x5 pointing at Lone, 16 bytes after Lsigma.
325	ld1	{v24.4s},[x5],#16
326	ldp	x24,x25,[x3]		// load key
327	ldp	x26,x27,[x3,#16]
328	ld1	{v25.4s,v26.4s},[x3]
// x30 (link register) was saved in the frame above, so it is reused here as
// a data register until the epilogue reloads it.
329	ldp	x28,x30,[x4]		// load counter
330	ld1	{v27.4s},[x4]
// v31 = {1,0,0,0} (Lone)
331	ld1	{v31.4s},[x5]
332#ifdef	__AARCH64EB__
333	rev64	v24.4s,v24.4s
334	ror	x24,x24,#32
335	ror	x25,x25,#32
336	ror	x26,x26,#32
337	ror	x27,x27,#32
338	ror	x28,x28,#32
339	ror	x30,x30,#32
340#endif
// The scalar block uses the counter as loaded (x28); the NEON blocks A, B, C
// use counter+1, +2, +3 in lane 0 of v27, v28, v29.
341	add	v27.4s,v27.4s,v31.4s		// += 1
342	add	v28.4s,v27.4s,v31.4s
343	add	v29.4s,v28.4s,v31.4s
344	shl	v31.4s,v31.4s,#2			// 1 -> 4
345
// One pass of Loop_outer_neon produces up to 256 bytes of keystream. It
// starts by copying the input block into the working registers of all four
// blocks.
346Loop_outer_neon:
347	mov	w5,w22			// unpack key block
348	lsr	x6,x22,#32
349	mov	v0.16b,v24.16b
350	mov	w7,w23
351	lsr	x8,x23,#32
352	mov	v4.16b,v24.16b
353	mov	w9,w24
354	lsr	x10,x24,#32
355	mov	v16.16b,v24.16b
356	mov	w11,w25
357	mov	v1.16b,v25.16b
358	lsr	x12,x25,#32
359	mov	v5.16b,v25.16b
360	mov	w13,w26
361	mov	v17.16b,v25.16b
362	lsr	x14,x26,#32
363	mov	v3.16b,v27.16b
364	mov	w15,w27
365	mov	v7.16b,v28.16b
366	lsr	x16,x27,#32
367	mov	v19.16b,v29.16b
368	mov	w17,w28
369	mov	v2.16b,v26.16b
370	lsr	x19,x28,#32
371	mov	v6.16b,v26.16b
372	mov	w20,w30
373	mov	v18.16b,v26.16b
374	lsr	x21,x30,#32
375
// x4 counts 10 passes of Loop_neon; each pass is a column round followed by
// a diagonal round for all four blocks.
376	mov	x4,#10
// The flags set here are consumed by "b.lo Ltail_neon" and
// "b.hi Loop_outer_neon" below; nothing in between writes the condition
// flags.
377	subs	x2,x2,#256
// Loop body: NEON and scalar instructions alternate line by line.
// Rotate-left idioms used in this loop:
//   scalar: ror #16/#20/#24/#25 = rotate-left by 16/12/8/7
//   NEON:   rev32 on .8h lanes  = rotate each 32-bit lane by 16
//           ushr #(32-n) into the destination, then sli #n from the same
//           temporary = rotate-left by n (n = 12, 8, 7)
378Loop_neon:
379	sub	x4,x4,#1
// ---- column round ----
380	add	v0.4s,v0.4s,v1.4s
381	add	w5,w5,w9
382	add	v4.4s,v4.4s,v5.4s
383	add	w6,w6,w10
384	add	v16.4s,v16.4s,v17.4s
385	add	w7,w7,w11
386	eor	v3.16b,v3.16b,v0.16b
387	add	w8,w8,w12
388	eor	v7.16b,v7.16b,v4.16b
389	eor	w17,w17,w5
390	eor	v19.16b,v19.16b,v16.16b
391	eor	w19,w19,w6
392	rev32	v3.8h,v3.8h
393	eor	w20,w20,w7
394	rev32	v7.8h,v7.8h
395	eor	w21,w21,w8
396	rev32	v19.8h,v19.8h
397	ror	w17,w17,#16
398	add	v2.4s,v2.4s,v3.4s
399	ror	w19,w19,#16
400	add	v6.4s,v6.4s,v7.4s
401	ror	w20,w20,#16
402	add	v18.4s,v18.4s,v19.4s
403	ror	w21,w21,#16
404	eor	v20.16b,v1.16b,v2.16b
405	add	w13,w13,w17
406	eor	v21.16b,v5.16b,v6.16b
407	add	w14,w14,w19
408	eor	v22.16b,v17.16b,v18.16b
409	add	w15,w15,w20
410	ushr	v1.4s,v20.4s,#20
411	add	w16,w16,w21
412	ushr	v5.4s,v21.4s,#20
413	eor	w9,w9,w13
414	ushr	v17.4s,v22.4s,#20
415	eor	w10,w10,w14
416	sli	v1.4s,v20.4s,#12
417	eor	w11,w11,w15
418	sli	v5.4s,v21.4s,#12
419	eor	w12,w12,w16
420	sli	v17.4s,v22.4s,#12
421	ror	w9,w9,#20
422	add	v0.4s,v0.4s,v1.4s
423	ror	w10,w10,#20
424	add	v4.4s,v4.4s,v5.4s
425	ror	w11,w11,#20
426	add	v16.4s,v16.4s,v17.4s
427	ror	w12,w12,#20
428	eor	v20.16b,v3.16b,v0.16b
429	add	w5,w5,w9
430	eor	v21.16b,v7.16b,v4.16b
431	add	w6,w6,w10
432	eor	v22.16b,v19.16b,v16.16b
433	add	w7,w7,w11
434	ushr	v3.4s,v20.4s,#24
435	add	w8,w8,w12
436	ushr	v7.4s,v21.4s,#24
437	eor	w17,w17,w5
438	ushr	v19.4s,v22.4s,#24
439	eor	w19,w19,w6
440	sli	v3.4s,v20.4s,#8
441	eor	w20,w20,w7
442	sli	v7.4s,v21.4s,#8
443	eor	w21,w21,w8
444	sli	v19.4s,v22.4s,#8
445	ror	w17,w17,#24
446	add	v2.4s,v2.4s,v3.4s
447	ror	w19,w19,#24
448	add	v6.4s,v6.4s,v7.4s
449	ror	w20,w20,#24
450	add	v18.4s,v18.4s,v19.4s
451	ror	w21,w21,#24
452	eor	v20.16b,v1.16b,v2.16b
453	add	w13,w13,w17
454	eor	v21.16b,v5.16b,v6.16b
455	add	w14,w14,w19
456	eor	v22.16b,v17.16b,v18.16b
457	add	w15,w15,w20
458	ushr	v1.4s,v20.4s,#25
459	add	w16,w16,w21
460	ushr	v5.4s,v21.4s,#25
461	eor	w9,w9,w13
462	ushr	v17.4s,v22.4s,#25
463	eor	w10,w10,w14
464	sli	v1.4s,v20.4s,#7
465	eor	w11,w11,w15
466	sli	v5.4s,v21.4s,#7
467	eor	w12,w12,w16
468	sli	v17.4s,v22.4s,#7
469	ror	w9,w9,#25
// NEON only: rotate rows 1/2/3 by 1/2/3 lanes (ext #4/#8/#12) so that the
// next set of lane-wise operations works on the diagonals.
470	ext	v2.16b,v2.16b,v2.16b,#8
471	ror	w10,w10,#25
472	ext	v6.16b,v6.16b,v6.16b,#8
473	ror	w11,w11,#25
474	ext	v18.16b,v18.16b,v18.16b,#8
475	ror	w12,w12,#25
476	ext	v3.16b,v3.16b,v3.16b,#12
477	ext	v7.16b,v7.16b,v7.16b,#12
478	ext	v19.16b,v19.16b,v19.16b,#12
479	ext	v1.16b,v1.16b,v1.16b,#4
480	ext	v5.16b,v5.16b,v5.16b,#4
481	ext	v17.16b,v17.16b,v17.16b,#4
// ---- diagonal round (the scalar side selects diagonals by register
// choice instead of lane rotation) ----
482	add	v0.4s,v0.4s,v1.4s
483	add	w5,w5,w10
484	add	v4.4s,v4.4s,v5.4s
485	add	w6,w6,w11
486	add	v16.4s,v16.4s,v17.4s
487	add	w7,w7,w12
488	eor	v3.16b,v3.16b,v0.16b
489	add	w8,w8,w9
490	eor	v7.16b,v7.16b,v4.16b
491	eor	w21,w21,w5
492	eor	v19.16b,v19.16b,v16.16b
493	eor	w17,w17,w6
494	rev32	v3.8h,v3.8h
495	eor	w19,w19,w7
496	rev32	v7.8h,v7.8h
497	eor	w20,w20,w8
498	rev32	v19.8h,v19.8h
499	ror	w21,w21,#16
500	add	v2.4s,v2.4s,v3.4s
501	ror	w17,w17,#16
502	add	v6.4s,v6.4s,v7.4s
503	ror	w19,w19,#16
504	add	v18.4s,v18.4s,v19.4s
505	ror	w20,w20,#16
506	eor	v20.16b,v1.16b,v2.16b
507	add	w15,w15,w21
508	eor	v21.16b,v5.16b,v6.16b
509	add	w16,w16,w17
510	eor	v22.16b,v17.16b,v18.16b
511	add	w13,w13,w19
512	ushr	v1.4s,v20.4s,#20
513	add	w14,w14,w20
514	ushr	v5.4s,v21.4s,#20
515	eor	w10,w10,w15
516	ushr	v17.4s,v22.4s,#20
517	eor	w11,w11,w16
518	sli	v1.4s,v20.4s,#12
519	eor	w12,w12,w13
520	sli	v5.4s,v21.4s,#12
521	eor	w9,w9,w14
522	sli	v17.4s,v22.4s,#12
523	ror	w10,w10,#20
524	add	v0.4s,v0.4s,v1.4s
525	ror	w11,w11,#20
526	add	v4.4s,v4.4s,v5.4s
527	ror	w12,w12,#20
528	add	v16.4s,v16.4s,v17.4s
529	ror	w9,w9,#20
530	eor	v20.16b,v3.16b,v0.16b
531	add	w5,w5,w10
532	eor	v21.16b,v7.16b,v4.16b
533	add	w6,w6,w11
534	eor	v22.16b,v19.16b,v16.16b
535	add	w7,w7,w12
536	ushr	v3.4s,v20.4s,#24
537	add	w8,w8,w9
538	ushr	v7.4s,v21.4s,#24
539	eor	w21,w21,w5
540	ushr	v19.4s,v22.4s,#24
541	eor	w17,w17,w6
542	sli	v3.4s,v20.4s,#8
543	eor	w19,w19,w7
544	sli	v7.4s,v21.4s,#8
545	eor	w20,w20,w8
546	sli	v19.4s,v22.4s,#8
547	ror	w21,w21,#24
548	add	v2.4s,v2.4s,v3.4s
549	ror	w17,w17,#24
550	add	v6.4s,v6.4s,v7.4s
551	ror	w19,w19,#24
552	add	v18.4s,v18.4s,v19.4s
553	ror	w20,w20,#24
554	eor	v20.16b,v1.16b,v2.16b
555	add	w15,w15,w21
556	eor	v21.16b,v5.16b,v6.16b
557	add	w16,w16,w17
558	eor	v22.16b,v17.16b,v18.16b
559	add	w13,w13,w19
560	ushr	v1.4s,v20.4s,#25
561	add	w14,w14,w20
562	ushr	v5.4s,v21.4s,#25
563	eor	w10,w10,w15
564	ushr	v17.4s,v22.4s,#25
565	eor	w11,w11,w16
566	sli	v1.4s,v20.4s,#7
567	eor	w12,w12,w13
568	sli	v5.4s,v21.4s,#7
569	eor	w9,w9,w14
570	sli	v17.4s,v22.4s,#7
571	ror	w10,w10,#25
// NEON only: rotate the rows back (ext #12/#8/#4 on rows 1/2/3) to restore
// the column layout for the next pass.
572	ext	v2.16b,v2.16b,v2.16b,#8
573	ror	w11,w11,#25
574	ext	v6.16b,v6.16b,v6.16b,#8
575	ror	w12,w12,#25
576	ext	v18.16b,v18.16b,v18.16b,#8
577	ror	w9,w9,#25
578	ext	v3.16b,v3.16b,v3.16b,#4
579	ext	v7.16b,v7.16b,v7.16b,#4
580	ext	v19.16b,v19.16b,v19.16b,#4
581	ext	v1.16b,v1.16b,v1.16b,#12
582	ext	v5.16b,v5.16b,v5.16b,#12
583	ext	v17.16b,v17.16b,v17.16b,#12
584	cbnz	x4,Loop_neon
585
// Add the original input block back into each permuted state. Every block
// gets its own counter row (x28 for scalar, v27/v28/v29 for A/B/C).
586	add	w5,w5,w22		// accumulate key block
587	add	v0.4s,v0.4s,v24.4s
588	add	x6,x6,x22,lsr#32
589	add	v4.4s,v4.4s,v24.4s
590	add	w7,w7,w23
591	add	v16.4s,v16.4s,v24.4s
592	add	x8,x8,x23,lsr#32
593	add	v2.4s,v2.4s,v26.4s
594	add	w9,w9,w24
595	add	v6.4s,v6.4s,v26.4s
596	add	x10,x10,x24,lsr#32
597	add	v18.4s,v18.4s,v26.4s
598	add	w11,w11,w25
599	add	v3.4s,v3.4s,v27.4s
600	add	x12,x12,x25,lsr#32
601	add	w13,w13,w26
602	add	v7.4s,v7.4s,v28.4s
603	add	x14,x14,x26,lsr#32
604	add	w15,w15,w27
605	add	v19.4s,v19.4s,v29.4s
606	add	x16,x16,x27,lsr#32
607	add	w17,w17,w28
608	add	v1.4s,v1.4s,v25.4s
609	add	x19,x19,x28,lsr#32
610	add	w20,w20,w30
611	add	v5.4s,v5.4s,v25.4s
612	add	x21,x21,x30,lsr#32
613	add	v17.4s,v17.4s,v25.4s
614
// Flags still come from "subs x2,x2,#256": "lo" means fewer than 256 bytes
// were left, handled block by block in Ltail_neon.
615	b.lo	Ltail_neon
616
// Full 256 bytes. Output order is: scalar block, then NEON blocks A, B, C.
617	add	x5,x5,x6,lsl#32	// pack
618	add	x7,x7,x8,lsl#32
619	ldp	x6,x8,[x1,#0]		// load input
620	add	x9,x9,x10,lsl#32
621	add	x11,x11,x12,lsl#32
622	ldp	x10,x12,[x1,#16]
623	add	x13,x13,x14,lsl#32
624	add	x15,x15,x16,lsl#32
625	ldp	x14,x16,[x1,#32]
626	add	x17,x17,x19,lsl#32
627	add	x20,x20,x21,lsl#32
628	ldp	x19,x21,[x1,#48]
629	add	x1,x1,#64
630#ifdef	__AARCH64EB__
631	rev	x5,x5
632	rev	x7,x7
633	rev	x9,x9
634	rev	x11,x11
635	rev	x13,x13
636	rev	x15,x15
637	rev	x17,x17
638	rev	x20,x20
639#endif
// input for block A
640	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
641	eor	x5,x5,x6
642	eor	x7,x7,x8
643	eor	x9,x9,x10
644	eor	x11,x11,x12
645	eor	x13,x13,x14
646	eor	v0.16b,v0.16b,v20.16b
647	eor	x15,x15,x16
648	eor	v1.16b,v1.16b,v21.16b
649	eor	x17,x17,x19
650	eor	v2.16b,v2.16b,v22.16b
651	eor	x20,x20,x21
652	eor	v3.16b,v3.16b,v23.16b
// input for block B
653	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
654
655	stp	x5,x7,[x0,#0]		// store output
// Four blocks were consumed, so every counter advances by 4 (v31 = {4,0,0,0}).
656	add	x28,x28,#4			// increment counter
657	stp	x9,x11,[x0,#16]
658	add	v27.4s,v27.4s,v31.4s		// += 4
659	stp	x13,x15,[x0,#32]
660	add	v28.4s,v28.4s,v31.4s
661	stp	x17,x20,[x0,#48]
662	add	v29.4s,v29.4s,v31.4s
663	add	x0,x0,#64
664
665	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
// input for block C, loaded into v0-v3 which are free after the store above
666	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
667
668	eor	v4.16b,v4.16b,v20.16b
669	eor	v5.16b,v5.16b,v21.16b
670	eor	v6.16b,v6.16b,v22.16b
671	eor	v7.16b,v7.16b,v23.16b
672	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
673
674	eor	v16.16b,v16.16b,v0.16b
675	eor	v17.16b,v17.16b,v1.16b
676	eor	v18.16b,v18.16b,v2.16b
677	eor	v19.16b,v19.16b,v3.16b
678	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
679
// "hi" (from the same subs): bytes remain after these 256.
680	b.hi	Loop_outer_neon
681
// Exact multiple of 256 bytes: restore callee-saved registers and return.
682	ldp	x19,x20,[x29,#16]
683	add	sp,sp,#64
684	ldp	x21,x22,[x29,#32]
685	ldp	x23,x24,[x29,#48]
686	ldp	x25,x26,[x29,#64]
687	ldp	x27,x28,[x29,#80]
688	ldp	x29,x30,[sp],#96
689	AARCH64_VALIDATE_LINK_REGISTER
690	ret
691
// Tail: fewer than 256 bytes remain. Keystream for all four blocks is already
// computed; consume it 64 bytes at a time in output order (scalar, A, B, C).
692Ltail_neon:
// undo the subs so x2 is the remaining byte count again
693	add	x2,x2,#256
694	cmp	x2,#64
// Fewer than 64 bytes: finish with the scalar block via Less_than_64, which
// lives in ChaCha20_ctr32_nohw and returns through that function's epilogue.
695	b.lo	Less_than_64
696
697	add	x5,x5,x6,lsl#32	// pack
698	add	x7,x7,x8,lsl#32
699	ldp	x6,x8,[x1,#0]		// load input
700	add	x9,x9,x10,lsl#32
701	add	x11,x11,x12,lsl#32
702	ldp	x10,x12,[x1,#16]
703	add	x13,x13,x14,lsl#32
704	add	x15,x15,x16,lsl#32
705	ldp	x14,x16,[x1,#32]
706	add	x17,x17,x19,lsl#32
707	add	x20,x20,x21,lsl#32
708	ldp	x19,x21,[x1,#48]
709	add	x1,x1,#64
710#ifdef	__AARCH64EB__
711	rev	x5,x5
712	rev	x7,x7
713	rev	x9,x9
714	rev	x11,x11
715	rev	x13,x13
716	rev	x15,x15
717	rev	x17,x17
718	rev	x20,x20
719#endif
720	eor	x5,x5,x6
721	eor	x7,x7,x8
722	eor	x9,x9,x10
723	eor	x11,x11,x12
724	eor	x13,x13,x14
725	eor	x15,x15,x16
726	eor	x17,x17,x19
727	eor	x20,x20,x21
728
729	stp	x5,x7,[x0,#0]		// store output
730	add	x28,x28,#4			// increment counter
731	stp	x9,x11,[x0,#16]
732	stp	x13,x15,[x0,#32]
733	stp	x17,x20,[x0,#48]
734	add	x0,x0,#64
// Flags are still those of "cmp x2,#64" above (nothing since then writes
// them): "eq" means exactly 64 bytes were left, so the work is done.
735	b.eq	Ldone_neon
736	sub	x2,x2,#64
737	cmp	x2,#64
// fewer than 64 bytes left: they use block A's keystream
738	b.lo	Less_than_128
739
// at least 64 more bytes: whole block A
740	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
741	eor	v0.16b,v0.16b,v20.16b
742	eor	v1.16b,v1.16b,v21.16b
743	eor	v2.16b,v2.16b,v22.16b
744	eor	v3.16b,v3.16b,v23.16b
745	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
// flags from the most recent "cmp x2,#64"
746	b.eq	Ldone_neon
747	sub	x2,x2,#64
748	cmp	x2,#64
// fewer than 64 bytes left: they use block B's keystream
749	b.lo	Less_than_192
750
// at least 64 more bytes: whole block B
751	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
752	eor	v4.16b,v4.16b,v20.16b
753	eor	v5.16b,v5.16b,v21.16b
754	eor	v6.16b,v6.16b,v22.16b
755	eor	v7.16b,v7.16b,v23.16b
756	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
// flags from the most recent "cmp x2,#64"
757	b.eq	Ldone_neon
758	sub	x2,x2,#64
759
// The remaining bytes (fewer than 64) use block C's keystream, spilled to the
// stack scratch area for the byte loop.
760	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
761	b	Last_neon
762
// spill block A's keystream for the byte loop
763Less_than_128:
764	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
765	b	Last_neon
// spill block B's keystream for the byte loop
766Less_than_192:
767	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
768	b	Last_neon
769
770.align	4
// Byte-wise finish. x2 = remaining bytes, keystream block at [sp].
// Point x1/x0/x4 just past the end of input/output/scratch and run x2 from
// -count up to 0. x0 is biased by -1 because the store in Loop_tail_neon
// indexes with x2 after it has been incremented.
771Last_neon:
772	sub	x0,x0,#1
773	add	x1,x1,x2
774	add	x0,x0,x2
775	add	x4,sp,x2
776	neg	x2,x2
777
// out[i] = in[i] ^ keystream[i], one byte per iteration, until x2 reaches 0.
778Loop_tail_neon:
779	ldrb	w10,[x1,x2]
780	ldrb	w11,[x4,x2]
781	add	x2,x2,#1
782	eor	w10,w10,w11
783	strb	w10,[x0,x2]
784	cbnz	x2,Loop_tail_neon
785
// Overwrite the keystream copy on the stack with zeros before returning.
786	stp	xzr,xzr,[sp,#0]
787	stp	xzr,xzr,[sp,#16]
788	stp	xzr,xzr,[sp,#32]
789	stp	xzr,xzr,[sp,#48]
790
// Common exit for the tail paths: release the scratch area, restore
// callee-saved registers through x29, and return.
791Ldone_neon:
792	ldp	x19,x20,[x29,#16]
793	add	sp,sp,#64
794	ldp	x21,x22,[x29,#32]
795	ldp	x23,x24,[x29,#48]
796	ldp	x25,x26,[x29,#64]
797	ldp	x27,x28,[x29,#80]
798	ldp	x29,x30,[sp],#96
799	AARCH64_VALIDATE_LINK_REGISTER
800	ret
801
802.def ChaCha20_512_neon
803   .type 32
804.endef
805.align	5
806ChaCha20_512_neon:
807	AARCH64_SIGN_LINK_REGISTER
808	stp	x29,x30,[sp,#-96]!
809	add	x29,sp,#0
810
811	adrp	x5,Lsigma
812	add	x5,x5,:lo12:Lsigma
813	stp	x19,x20,[sp,#16]
814	stp	x21,x22,[sp,#32]
815	stp	x23,x24,[sp,#48]
816	stp	x25,x26,[sp,#64]
817	stp	x27,x28,[sp,#80]
818
819L512_or_more_neon:
820	sub	sp,sp,#128+64
821
822	ldp	x22,x23,[x5]		// load sigma
823	ld1	{v24.4s},[x5],#16
824	ldp	x24,x25,[x3]		// load key
825	ldp	x26,x27,[x3,#16]
826	ld1	{v25.4s,v26.4s},[x3]
827	ldp	x28,x30,[x4]		// load counter
828	ld1	{v27.4s},[x4]
829	ld1	{v31.4s},[x5]
830#ifdef	__AARCH64EB__
831	rev64	v24.4s,v24.4s
832	ror	x24,x24,#32
833	ror	x25,x25,#32
834	ror	x26,x26,#32
835	ror	x27,x27,#32
836	ror	x28,x28,#32
837	ror	x30,x30,#32
838#endif
839	add	v27.4s,v27.4s,v31.4s		// += 1
840	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
841	add	v27.4s,v27.4s,v31.4s		// not typo
842	str	q26,[sp,#32]
843	add	v28.4s,v27.4s,v31.4s
844	add	v29.4s,v28.4s,v31.4s
845	add	v30.4s,v29.4s,v31.4s
846	shl	v31.4s,v31.4s,#2			// 1 -> 4
847
848	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
849	stp	d10,d11,[sp,#128+16]
850	stp	d12,d13,[sp,#128+32]
851	stp	d14,d15,[sp,#128+48]
852
853	sub	x2,x2,#512			// not typo
854
855Loop_outer_512_neon:
856	mov	v0.16b,v24.16b
857	mov	v4.16b,v24.16b
858	mov	v8.16b,v24.16b
859	mov	v12.16b,v24.16b
860	mov	v16.16b,v24.16b
861	mov	v20.16b,v24.16b
862	mov	v1.16b,v25.16b
863	mov	w5,w22			// unpack key block
864	mov	v5.16b,v25.16b
865	lsr	x6,x22,#32
866	mov	v9.16b,v25.16b
867	mov	w7,w23
868	mov	v13.16b,v25.16b
869	lsr	x8,x23,#32
870	mov	v17.16b,v25.16b
871	mov	w9,w24
872	mov	v21.16b,v25.16b
873	lsr	x10,x24,#32
874	mov	v3.16b,v27.16b
875	mov	w11,w25
876	mov	v7.16b,v28.16b
877	lsr	x12,x25,#32
878	mov	v11.16b,v29.16b
879	mov	w13,w26
880	mov	v15.16b,v30.16b
881	lsr	x14,x26,#32
882	mov	v2.16b,v26.16b
883	mov	w15,w27
884	mov	v6.16b,v26.16b
885	lsr	x16,x27,#32
886	add	v19.4s,v3.4s,v31.4s			// +4
887	mov	w17,w28
888	add	v23.4s,v7.4s,v31.4s			// +4
889	lsr	x19,x28,#32
890	mov	v10.16b,v26.16b
891	mov	w20,w30
892	mov	v14.16b,v26.16b
893	lsr	x21,x30,#32
894	mov	v18.16b,v26.16b
895	stp	q27,q28,[sp,#48]		// off-load key block, variable part
896	mov	v22.16b,v26.16b
897	str	q29,[sp,#80]
898
899	mov	x4,#5
900	subs	x2,x2,#512
901Loop_upper_neon:
902	sub	x4,x4,#1
903	add	v0.4s,v0.4s,v1.4s
904	add	w5,w5,w9
905	add	v4.4s,v4.4s,v5.4s
906	add	w6,w6,w10
907	add	v8.4s,v8.4s,v9.4s
908	add	w7,w7,w11
909	add	v12.4s,v12.4s,v13.4s
910	add	w8,w8,w12
911	add	v16.4s,v16.4s,v17.4s
912	eor	w17,w17,w5
913	add	v20.4s,v20.4s,v21.4s
914	eor	w19,w19,w6
915	eor	v3.16b,v3.16b,v0.16b
916	eor	w20,w20,w7
917	eor	v7.16b,v7.16b,v4.16b
918	eor	w21,w21,w8
919	eor	v11.16b,v11.16b,v8.16b
920	ror	w17,w17,#16
921	eor	v15.16b,v15.16b,v12.16b
922	ror	w19,w19,#16
923	eor	v19.16b,v19.16b,v16.16b
924	ror	w20,w20,#16
925	eor	v23.16b,v23.16b,v20.16b
926	ror	w21,w21,#16
927	rev32	v3.8h,v3.8h
928	add	w13,w13,w17
929	rev32	v7.8h,v7.8h
930	add	w14,w14,w19
931	rev32	v11.8h,v11.8h
932	add	w15,w15,w20
933	rev32	v15.8h,v15.8h
934	add	w16,w16,w21
935	rev32	v19.8h,v19.8h
936	eor	w9,w9,w13
937	rev32	v23.8h,v23.8h
938	eor	w10,w10,w14
939	add	v2.4s,v2.4s,v3.4s
940	eor	w11,w11,w15
941	add	v6.4s,v6.4s,v7.4s
942	eor	w12,w12,w16
943	add	v10.4s,v10.4s,v11.4s
944	ror	w9,w9,#20
945	add	v14.4s,v14.4s,v15.4s
946	ror	w10,w10,#20
947	add	v18.4s,v18.4s,v19.4s
948	ror	w11,w11,#20
949	add	v22.4s,v22.4s,v23.4s
950	ror	w12,w12,#20
951	eor	v24.16b,v1.16b,v2.16b
952	add	w5,w5,w9
953	eor	v25.16b,v5.16b,v6.16b
954	add	w6,w6,w10
955	eor	v26.16b,v9.16b,v10.16b
956	add	w7,w7,w11
957	eor	v27.16b,v13.16b,v14.16b
958	add	w8,w8,w12
959	eor	v28.16b,v17.16b,v18.16b
960	eor	w17,w17,w5
961	eor	v29.16b,v21.16b,v22.16b
962	eor	w19,w19,w6
963	ushr	v1.4s,v24.4s,#20
964	eor	w20,w20,w7
965	ushr	v5.4s,v25.4s,#20
966	eor	w21,w21,w8
967	ushr	v9.4s,v26.4s,#20
968	ror	w17,w17,#24
969	ushr	v13.4s,v27.4s,#20
970	ror	w19,w19,#24
971	ushr	v17.4s,v28.4s,#20
972	ror	w20,w20,#24
973	ushr	v21.4s,v29.4s,#20
974	ror	w21,w21,#24
975	sli	v1.4s,v24.4s,#12
976	add	w13,w13,w17
977	sli	v5.4s,v25.4s,#12
978	add	w14,w14,w19
979	sli	v9.4s,v26.4s,#12
980	add	w15,w15,w20
981	sli	v13.4s,v27.4s,#12
982	add	w16,w16,w21
983	sli	v17.4s,v28.4s,#12
984	eor	w9,w9,w13
985	sli	v21.4s,v29.4s,#12
986	eor	w10,w10,w14
987	add	v0.4s,v0.4s,v1.4s
988	eor	w11,w11,w15
989	add	v4.4s,v4.4s,v5.4s
990	eor	w12,w12,w16
991	add	v8.4s,v8.4s,v9.4s
992	ror	w9,w9,#25
993	add	v12.4s,v12.4s,v13.4s
994	ror	w10,w10,#25
995	add	v16.4s,v16.4s,v17.4s
996	ror	w11,w11,#25
997	add	v20.4s,v20.4s,v21.4s
998	ror	w12,w12,#25
999	eor	v24.16b,v3.16b,v0.16b
1000	add	w5,w5,w10
1001	eor	v25.16b,v7.16b,v4.16b
1002	add	w6,w6,w11
1003	eor	v26.16b,v11.16b,v8.16b
1004	add	w7,w7,w12
1005	eor	v27.16b,v15.16b,v12.16b
1006	add	w8,w8,w9
1007	eor	v28.16b,v19.16b,v16.16b
1008	eor	w21,w21,w5
1009	eor	v29.16b,v23.16b,v20.16b
1010	eor	w17,w17,w6
1011	ushr	v3.4s,v24.4s,#24
1012	eor	w19,w19,w7
1013	ushr	v7.4s,v25.4s,#24
1014	eor	w20,w20,w8
1015	ushr	v11.4s,v26.4s,#24
1016	ror	w21,w21,#16
1017	ushr	v15.4s,v27.4s,#24
1018	ror	w17,w17,#16
1019	ushr	v19.4s,v28.4s,#24
1020	ror	w19,w19,#16
1021	ushr	v23.4s,v29.4s,#24
1022	ror	w20,w20,#16
1023	sli	v3.4s,v24.4s,#8
1024	add	w15,w15,w21
1025	sli	v7.4s,v25.4s,#8
1026	add	w16,w16,w17
1027	sli	v11.4s,v26.4s,#8
1028	add	w13,w13,w19
1029	sli	v15.4s,v27.4s,#8
1030	add	w14,w14,w20
1031	sli	v19.4s,v28.4s,#8
1032	eor	w10,w10,w15
1033	sli	v23.4s,v29.4s,#8
1034	eor	w11,w11,w16
1035	add	v2.4s,v2.4s,v3.4s
1036	eor	w12,w12,w13
1037	add	v6.4s,v6.4s,v7.4s
1038	eor	w9,w9,w14
1039	add	v10.4s,v10.4s,v11.4s
1040	ror	w10,w10,#20
1041	add	v14.4s,v14.4s,v15.4s
1042	ror	w11,w11,#20
1043	add	v18.4s,v18.4s,v19.4s
1044	ror	w12,w12,#20
1045	add	v22.4s,v22.4s,v23.4s
1046	ror	w9,w9,#20
1047	eor	v24.16b,v1.16b,v2.16b
1048	add	w5,w5,w10
1049	eor	v25.16b,v5.16b,v6.16b
1050	add	w6,w6,w11
1051	eor	v26.16b,v9.16b,v10.16b
1052	add	w7,w7,w12
1053	eor	v27.16b,v13.16b,v14.16b
1054	add	w8,w8,w9
1055	eor	v28.16b,v17.16b,v18.16b
1056	eor	w21,w21,w5
1057	eor	v29.16b,v21.16b,v22.16b
1058	eor	w17,w17,w6
1059	ushr	v1.4s,v24.4s,#25
1060	eor	w19,w19,w7
1061	ushr	v5.4s,v25.4s,#25
1062	eor	w20,w20,w8
1063	ushr	v9.4s,v26.4s,#25
1064	ror	w21,w21,#24
1065	ushr	v13.4s,v27.4s,#25
1066	ror	w17,w17,#24
1067	ushr	v17.4s,v28.4s,#25
1068	ror	w19,w19,#24
1069	ushr	v21.4s,v29.4s,#25
1070	ror	w20,w20,#24
1071	sli	v1.4s,v24.4s,#7
1072	add	w15,w15,w21
1073	sli	v5.4s,v25.4s,#7
1074	add	w16,w16,w17
1075	sli	v9.4s,v26.4s,#7
1076	add	w13,w13,w19
1077	sli	v13.4s,v27.4s,#7
1078	add	w14,w14,w20
1079	sli	v17.4s,v28.4s,#7
1080	eor	w10,w10,w15
1081	sli	v21.4s,v29.4s,#7
1082	eor	w11,w11,w16
1083	ext	v2.16b,v2.16b,v2.16b,#8
1084	eor	w12,w12,w13
1085	ext	v6.16b,v6.16b,v6.16b,#8
1086	eor	w9,w9,w14
1087	ext	v10.16b,v10.16b,v10.16b,#8
1088	ror	w10,w10,#25
1089	ext	v14.16b,v14.16b,v14.16b,#8
1090	ror	w11,w11,#25
1091	ext	v18.16b,v18.16b,v18.16b,#8
1092	ror	w12,w12,#25
1093	ext	v22.16b,v22.16b,v22.16b,#8
1094	ror	w9,w9,#25
1095	ext	v3.16b,v3.16b,v3.16b,#12
1096	ext	v7.16b,v7.16b,v7.16b,#12
1097	ext	v11.16b,v11.16b,v11.16b,#12
1098	ext	v15.16b,v15.16b,v15.16b,#12
1099	ext	v19.16b,v19.16b,v19.16b,#12
1100	ext	v23.16b,v23.16b,v23.16b,#12
1101	ext	v1.16b,v1.16b,v1.16b,#4
1102	ext	v5.16b,v5.16b,v5.16b,#4
1103	ext	v9.16b,v9.16b,v9.16b,#4
1104	ext	v13.16b,v13.16b,v13.16b,#4
1105	ext	v17.16b,v17.16b,v17.16b,#4
1106	ext	v21.16b,v21.16b,v21.16b,#4
1107	add	v0.4s,v0.4s,v1.4s
1108	add	w5,w5,w9
1109	add	v4.4s,v4.4s,v5.4s
1110	add	w6,w6,w10
1111	add	v8.4s,v8.4s,v9.4s
1112	add	w7,w7,w11
1113	add	v12.4s,v12.4s,v13.4s
1114	add	w8,w8,w12
1115	add	v16.4s,v16.4s,v17.4s
1116	eor	w17,w17,w5
1117	add	v20.4s,v20.4s,v21.4s
1118	eor	w19,w19,w6
1119	eor	v3.16b,v3.16b,v0.16b
1120	eor	w20,w20,w7
1121	eor	v7.16b,v7.16b,v4.16b
1122	eor	w21,w21,w8
1123	eor	v11.16b,v11.16b,v8.16b
1124	ror	w17,w17,#16
1125	eor	v15.16b,v15.16b,v12.16b
1126	ror	w19,w19,#16
1127	eor	v19.16b,v19.16b,v16.16b
1128	ror	w20,w20,#16
1129	eor	v23.16b,v23.16b,v20.16b
1130	ror	w21,w21,#16
1131	rev32	v3.8h,v3.8h
1132	add	w13,w13,w17
1133	rev32	v7.8h,v7.8h
1134	add	w14,w14,w19
1135	rev32	v11.8h,v11.8h
1136	add	w15,w15,w20
1137	rev32	v15.8h,v15.8h
1138	add	w16,w16,w21
1139	rev32	v19.8h,v19.8h
1140	eor	w9,w9,w13
1141	rev32	v23.8h,v23.8h
1142	eor	w10,w10,w14
1143	add	v2.4s,v2.4s,v3.4s
1144	eor	w11,w11,w15
1145	add	v6.4s,v6.4s,v7.4s
1146	eor	w12,w12,w16
1147	add	v10.4s,v10.4s,v11.4s
1148	ror	w9,w9,#20
1149	add	v14.4s,v14.4s,v15.4s
1150	ror	w10,w10,#20
1151	add	v18.4s,v18.4s,v19.4s
1152	ror	w11,w11,#20
1153	add	v22.4s,v22.4s,v23.4s
1154	ror	w12,w12,#20
1155	eor	v24.16b,v1.16b,v2.16b
1156	add	w5,w5,w9
1157	eor	v25.16b,v5.16b,v6.16b
1158	add	w6,w6,w10
1159	eor	v26.16b,v9.16b,v10.16b
1160	add	w7,w7,w11
1161	eor	v27.16b,v13.16b,v14.16b
1162	add	w8,w8,w12
1163	eor	v28.16b,v17.16b,v18.16b
1164	eor	w17,w17,w5
1165	eor	v29.16b,v21.16b,v22.16b
1166	eor	w19,w19,w6
1167	ushr	v1.4s,v24.4s,#20
1168	eor	w20,w20,w7
1169	ushr	v5.4s,v25.4s,#20
1170	eor	w21,w21,w8
1171	ushr	v9.4s,v26.4s,#20
1172	ror	w17,w17,#24
1173	ushr	v13.4s,v27.4s,#20
1174	ror	w19,w19,#24
1175	ushr	v17.4s,v28.4s,#20
1176	ror	w20,w20,#24
1177	ushr	v21.4s,v29.4s,#20
1178	ror	w21,w21,#24
1179	sli	v1.4s,v24.4s,#12
1180	add	w13,w13,w17
1181	sli	v5.4s,v25.4s,#12
1182	add	w14,w14,w19
1183	sli	v9.4s,v26.4s,#12
1184	add	w15,w15,w20
1185	sli	v13.4s,v27.4s,#12
1186	add	w16,w16,w21
1187	sli	v17.4s,v28.4s,#12
1188	eor	w9,w9,w13
1189	sli	v21.4s,v29.4s,#12
1190	eor	w10,w10,w14
1191	add	v0.4s,v0.4s,v1.4s
1192	eor	w11,w11,w15
1193	add	v4.4s,v4.4s,v5.4s
1194	eor	w12,w12,w16
1195	add	v8.4s,v8.4s,v9.4s
1196	ror	w9,w9,#25
1197	add	v12.4s,v12.4s,v13.4s
1198	ror	w10,w10,#25
1199	add	v16.4s,v16.4s,v17.4s
1200	ror	w11,w11,#25
1201	add	v20.4s,v20.4s,v21.4s
1202	ror	w12,w12,#25
1203	eor	v24.16b,v3.16b,v0.16b
1204	add	w5,w5,w10
1205	eor	v25.16b,v7.16b,v4.16b
1206	add	w6,w6,w11
1207	eor	v26.16b,v11.16b,v8.16b
1208	add	w7,w7,w12
1209	eor	v27.16b,v15.16b,v12.16b
1210	add	w8,w8,w9
1211	eor	v28.16b,v19.16b,v16.16b
1212	eor	w21,w21,w5
1213	eor	v29.16b,v23.16b,v20.16b
1214	eor	w17,w17,w6
1215	ushr	v3.4s,v24.4s,#24
1216	eor	w19,w19,w7
1217	ushr	v7.4s,v25.4s,#24
1218	eor	w20,w20,w8
1219	ushr	v11.4s,v26.4s,#24
1220	ror	w21,w21,#16
1221	ushr	v15.4s,v27.4s,#24
1222	ror	w17,w17,#16
1223	ushr	v19.4s,v28.4s,#24
1224	ror	w19,w19,#16
1225	ushr	v23.4s,v29.4s,#24
1226	ror	w20,w20,#16
1227	sli	v3.4s,v24.4s,#8
1228	add	w15,w15,w21
1229	sli	v7.4s,v25.4s,#8
1230	add	w16,w16,w17
1231	sli	v11.4s,v26.4s,#8
1232	add	w13,w13,w19
1233	sli	v15.4s,v27.4s,#8
1234	add	w14,w14,w20
1235	sli	v19.4s,v28.4s,#8
1236	eor	w10,w10,w15
1237	sli	v23.4s,v29.4s,#8
1238	eor	w11,w11,w16
1239	add	v2.4s,v2.4s,v3.4s
1240	eor	w12,w12,w13
1241	add	v6.4s,v6.4s,v7.4s
1242	eor	w9,w9,w14
1243	add	v10.4s,v10.4s,v11.4s
1244	ror	w10,w10,#20
1245	add	v14.4s,v14.4s,v15.4s
1246	ror	w11,w11,#20
1247	add	v18.4s,v18.4s,v19.4s
1248	ror	w12,w12,#20
1249	add	v22.4s,v22.4s,v23.4s
1250	ror	w9,w9,#20
1251	eor	v24.16b,v1.16b,v2.16b
1252	add	w5,w5,w10
1253	eor	v25.16b,v5.16b,v6.16b
1254	add	w6,w6,w11
1255	eor	v26.16b,v9.16b,v10.16b
1256	add	w7,w7,w12
1257	eor	v27.16b,v13.16b,v14.16b
1258	add	w8,w8,w9
1259	eor	v28.16b,v17.16b,v18.16b
1260	eor	w21,w21,w5
1261	eor	v29.16b,v21.16b,v22.16b
1262	eor	w17,w17,w6
1263	ushr	v1.4s,v24.4s,#25
1264	eor	w19,w19,w7
1265	ushr	v5.4s,v25.4s,#25
1266	eor	w20,w20,w8
1267	ushr	v9.4s,v26.4s,#25
1268	ror	w21,w21,#24
1269	ushr	v13.4s,v27.4s,#25
1270	ror	w17,w17,#24
1271	ushr	v17.4s,v28.4s,#25
1272	ror	w19,w19,#24
1273	ushr	v21.4s,v29.4s,#25
1274	ror	w20,w20,#24
1275	sli	v1.4s,v24.4s,#7
1276	add	w15,w15,w21
1277	sli	v5.4s,v25.4s,#7
1278	add	w16,w16,w17
1279	sli	v9.4s,v26.4s,#7
1280	add	w13,w13,w19
1281	sli	v13.4s,v27.4s,#7
1282	add	w14,w14,w20
1283	sli	v17.4s,v28.4s,#7
1284	eor	w10,w10,w15
1285	sli	v21.4s,v29.4s,#7
1286	eor	w11,w11,w16
1287	ext	v2.16b,v2.16b,v2.16b,#8
1288	eor	w12,w12,w13
1289	ext	v6.16b,v6.16b,v6.16b,#8
1290	eor	w9,w9,w14
1291	ext	v10.16b,v10.16b,v10.16b,#8
1292	ror	w10,w10,#25
1293	ext	v14.16b,v14.16b,v14.16b,#8
1294	ror	w11,w11,#25
1295	ext	v18.16b,v18.16b,v18.16b,#8
1296	ror	w12,w12,#25
1297	ext	v22.16b,v22.16b,v22.16b,#8
1298	ror	w9,w9,#25
1299	ext	v3.16b,v3.16b,v3.16b,#4
1300	ext	v7.16b,v7.16b,v7.16b,#4
1301	ext	v11.16b,v11.16b,v11.16b,#4
1302	ext	v15.16b,v15.16b,v15.16b,#4
1303	ext	v19.16b,v19.16b,v19.16b,#4
1304	ext	v23.16b,v23.16b,v23.16b,#4
1305	ext	v1.16b,v1.16b,v1.16b,#12
1306	ext	v5.16b,v5.16b,v5.16b,#12
1307	ext	v9.16b,v9.16b,v9.16b,#12
1308	ext	v13.16b,v13.16b,v13.16b,#12
1309	ext	v17.16b,v17.16b,v17.16b,#12
1310	ext	v21.16b,v21.16b,v21.16b,#12
1311	cbnz	x4,Loop_upper_neon
1312
1313	add	w5,w5,w22		// accumulate key block
1314	add	x6,x6,x22,lsr#32
1315	add	w7,w7,w23
1316	add	x8,x8,x23,lsr#32
1317	add	w9,w9,w24
1318	add	x10,x10,x24,lsr#32
1319	add	w11,w11,w25
1320	add	x12,x12,x25,lsr#32
1321	add	w13,w13,w26
1322	add	x14,x14,x26,lsr#32
1323	add	w15,w15,w27
1324	add	x16,x16,x27,lsr#32
1325	add	w17,w17,w28
1326	add	x19,x19,x28,lsr#32
1327	add	w20,w20,w30
1328	add	x21,x21,x30,lsr#32
1329
1330	add	x5,x5,x6,lsl#32	// pack
1331	add	x7,x7,x8,lsl#32
1332	ldp	x6,x8,[x1,#0]		// load input
1333	add	x9,x9,x10,lsl#32
1334	add	x11,x11,x12,lsl#32
1335	ldp	x10,x12,[x1,#16]
1336	add	x13,x13,x14,lsl#32
1337	add	x15,x15,x16,lsl#32
1338	ldp	x14,x16,[x1,#32]
1339	add	x17,x17,x19,lsl#32
1340	add	x20,x20,x21,lsl#32
1341	ldp	x19,x21,[x1,#48]
1342	add	x1,x1,#64
1343#ifdef	__AARCH64EB__
1344	rev	x5,x5
1345	rev	x7,x7
1346	rev	x9,x9
1347	rev	x11,x11
1348	rev	x13,x13
1349	rev	x15,x15
1350	rev	x17,x17
1351	rev	x20,x20
1352#endif
1353	eor	x5,x5,x6
1354	eor	x7,x7,x8
1355	eor	x9,x9,x10
1356	eor	x11,x11,x12
1357	eor	x13,x13,x14
1358	eor	x15,x15,x16
1359	eor	x17,x17,x19
1360	eor	x20,x20,x21
1361
1362	stp	x5,x7,[x0,#0]		// store output
1363	add	x28,x28,#1			// increment counter
1364	mov	w5,w22			// unpack key block
1365	lsr	x6,x22,#32
1366	stp	x9,x11,[x0,#16]
1367	mov	w7,w23
1368	lsr	x8,x23,#32
1369	stp	x13,x15,[x0,#32]
1370	mov	w9,w24
1371	lsr	x10,x24,#32
1372	stp	x17,x20,[x0,#48]
1373	add	x0,x0,#64
1374	mov	w11,w25
1375	lsr	x12,x25,#32
1376	mov	w13,w26
1377	lsr	x14,x26,#32
1378	mov	w15,w27
1379	lsr	x16,x27,#32
1380	mov	w17,w28
1381	lsr	x19,x28,#32
1382	mov	w20,w30
1383	lsr	x21,x30,#32
1384
1385	mov	x4,#5
// Loop_lower_neon: second half of the round work for one 512-byte chunk.
// x4 is the iteration counter (set just before the label, decremented on
// entry, tested by the cbnz on the last line). No instruction in the loop
// writes the condition flags.
//
// Two independent instruction streams are interleaved line by line:
//   NEON   - six 4x4 states kept as row vectors a,b,c,d in
//            v0-v3, v4-v7, v8-v11, v12-v15, v16-v19, v20-v23, with v24-v29
//            as temporaries. Per iteration: one column round, then the rows
//            are rotated with ext, one diagonal round, and the rows are
//            rotated back.
//   scalar - one state in w5-w17 and w19-w21. Per iteration: column round,
//            diagonal round, column round, diagonal round (i.e. twice the
//            NEON round count).
// Rotations: the scalar stream uses ror by 16/20/24/25 (rotate-left by
// 16/12/8/7). The NEON stream uses rev32 on halfwords for 16 and a
// ushr+sli pair for 12, 8 and 7.
1386Loop_lower_neon:
1387	sub	x4,x4,#1
// NEON: column round begins. Scalar: 1st column round begins.
1388	add	v0.4s,v0.4s,v1.4s
1389	add	w5,w5,w9
1390	add	v4.4s,v4.4s,v5.4s
1391	add	w6,w6,w10
1392	add	v8.4s,v8.4s,v9.4s
1393	add	w7,w7,w11
1394	add	v12.4s,v12.4s,v13.4s
1395	add	w8,w8,w12
1396	add	v16.4s,v16.4s,v17.4s
1397	eor	w17,w17,w5
1398	add	v20.4s,v20.4s,v21.4s
1399	eor	w19,w19,w6
1400	eor	v3.16b,v3.16b,v0.16b
1401	eor	w20,w20,w7
1402	eor	v7.16b,v7.16b,v4.16b
1403	eor	w21,w21,w8
1404	eor	v11.16b,v11.16b,v8.16b
1405	ror	w17,w17,#16
1406	eor	v15.16b,v15.16b,v12.16b
1407	ror	w19,w19,#16
1408	eor	v19.16b,v19.16b,v16.16b
1409	ror	w20,w20,#16
1410	eor	v23.16b,v23.16b,v20.16b
1411	ror	w21,w21,#16
1412	rev32	v3.8h,v3.8h
1413	add	w13,w13,w17
1414	rev32	v7.8h,v7.8h
1415	add	w14,w14,w19
1416	rev32	v11.8h,v11.8h
1417	add	w15,w15,w20
1418	rev32	v15.8h,v15.8h
1419	add	w16,w16,w21
1420	rev32	v19.8h,v19.8h
1421	eor	w9,w9,w13
1422	rev32	v23.8h,v23.8h
1423	eor	w10,w10,w14
1424	add	v2.4s,v2.4s,v3.4s
1425	eor	w11,w11,w15
1426	add	v6.4s,v6.4s,v7.4s
1427	eor	w12,w12,w16
1428	add	v10.4s,v10.4s,v11.4s
1429	ror	w9,w9,#20
1430	add	v14.4s,v14.4s,v15.4s
1431	ror	w10,w10,#20
1432	add	v18.4s,v18.4s,v19.4s
1433	ror	w11,w11,#20
1434	add	v22.4s,v22.4s,v23.4s
1435	ror	w12,w12,#20
1436	eor	v24.16b,v1.16b,v2.16b
1437	add	w5,w5,w9
1438	eor	v25.16b,v5.16b,v6.16b
1439	add	w6,w6,w10
1440	eor	v26.16b,v9.16b,v10.16b
1441	add	w7,w7,w11
1442	eor	v27.16b,v13.16b,v14.16b
1443	add	w8,w8,w12
1444	eor	v28.16b,v17.16b,v18.16b
1445	eor	w17,w17,w5
1446	eor	v29.16b,v21.16b,v22.16b
1447	eor	w19,w19,w6
1448	ushr	v1.4s,v24.4s,#20
1449	eor	w20,w20,w7
1450	ushr	v5.4s,v25.4s,#20
1451	eor	w21,w21,w8
1452	ushr	v9.4s,v26.4s,#20
1453	ror	w17,w17,#24
1454	ushr	v13.4s,v27.4s,#20
1455	ror	w19,w19,#24
1456	ushr	v17.4s,v28.4s,#20
1457	ror	w20,w20,#24
1458	ushr	v21.4s,v29.4s,#20
1459	ror	w21,w21,#24
1460	sli	v1.4s,v24.4s,#12
1461	add	w13,w13,w17
1462	sli	v5.4s,v25.4s,#12
1463	add	w14,w14,w19
1464	sli	v9.4s,v26.4s,#12
1465	add	w15,w15,w20
1466	sli	v13.4s,v27.4s,#12
1467	add	w16,w16,w21
1468	sli	v17.4s,v28.4s,#12
1469	eor	w9,w9,w13
1470	sli	v21.4s,v29.4s,#12
1471	eor	w10,w10,w14
1472	add	v0.4s,v0.4s,v1.4s
1473	eor	w11,w11,w15
1474	add	v4.4s,v4.4s,v5.4s
1475	eor	w12,w12,w16
1476	add	v8.4s,v8.4s,v9.4s
1477	ror	w9,w9,#25
1478	add	v12.4s,v12.4s,v13.4s
1479	ror	w10,w10,#25
1480	add	v16.4s,v16.4s,v17.4s
1481	ror	w11,w11,#25
1482	add	v20.4s,v20.4s,v21.4s
1483	ror	w12,w12,#25
// Scalar: 1st diagonal round begins on the next scalar line (w5 now pairs
// with w10/w15/w21, w6 with w11/w16/w17, and so on). NEON is still inside
// its column round.
1484	eor	v24.16b,v3.16b,v0.16b
1485	add	w5,w5,w10
1486	eor	v25.16b,v7.16b,v4.16b
1487	add	w6,w6,w11
1488	eor	v26.16b,v11.16b,v8.16b
1489	add	w7,w7,w12
1490	eor	v27.16b,v15.16b,v12.16b
1491	add	w8,w8,w9
1492	eor	v28.16b,v19.16b,v16.16b
1493	eor	w21,w21,w5
1494	eor	v29.16b,v23.16b,v20.16b
1495	eor	w17,w17,w6
1496	ushr	v3.4s,v24.4s,#24
1497	eor	w19,w19,w7
1498	ushr	v7.4s,v25.4s,#24
1499	eor	w20,w20,w8
1500	ushr	v11.4s,v26.4s,#24
1501	ror	w21,w21,#16
1502	ushr	v15.4s,v27.4s,#24
1503	ror	w17,w17,#16
1504	ushr	v19.4s,v28.4s,#24
1505	ror	w19,w19,#16
1506	ushr	v23.4s,v29.4s,#24
1507	ror	w20,w20,#16
1508	sli	v3.4s,v24.4s,#8
1509	add	w15,w15,w21
1510	sli	v7.4s,v25.4s,#8
1511	add	w16,w16,w17
1512	sli	v11.4s,v26.4s,#8
1513	add	w13,w13,w19
1514	sli	v15.4s,v27.4s,#8
1515	add	w14,w14,w20
1516	sli	v19.4s,v28.4s,#8
1517	eor	w10,w10,w15
1518	sli	v23.4s,v29.4s,#8
1519	eor	w11,w11,w16
1520	add	v2.4s,v2.4s,v3.4s
1521	eor	w12,w12,w13
1522	add	v6.4s,v6.4s,v7.4s
1523	eor	w9,w9,w14
1524	add	v10.4s,v10.4s,v11.4s
1525	ror	w10,w10,#20
1526	add	v14.4s,v14.4s,v15.4s
1527	ror	w11,w11,#20
1528	add	v18.4s,v18.4s,v19.4s
1529	ror	w12,w12,#20
1530	add	v22.4s,v22.4s,v23.4s
1531	ror	w9,w9,#20
1532	eor	v24.16b,v1.16b,v2.16b
1533	add	w5,w5,w10
1534	eor	v25.16b,v5.16b,v6.16b
1535	add	w6,w6,w11
1536	eor	v26.16b,v9.16b,v10.16b
1537	add	w7,w7,w12
1538	eor	v27.16b,v13.16b,v14.16b
1539	add	w8,w8,w9
1540	eor	v28.16b,v17.16b,v18.16b
1541	eor	w21,w21,w5
1542	eor	v29.16b,v21.16b,v22.16b
1543	eor	w17,w17,w6
1544	ushr	v1.4s,v24.4s,#25
1545	eor	w19,w19,w7
1546	ushr	v5.4s,v25.4s,#25
1547	eor	w20,w20,w8
1548	ushr	v9.4s,v26.4s,#25
1549	ror	w21,w21,#24
1550	ushr	v13.4s,v27.4s,#25
1551	ror	w17,w17,#24
1552	ushr	v17.4s,v28.4s,#25
1553	ror	w19,w19,#24
1554	ushr	v21.4s,v29.4s,#25
1555	ror	w20,w20,#24
1556	sli	v1.4s,v24.4s,#7
1557	add	w15,w15,w21
1558	sli	v5.4s,v25.4s,#7
1559	add	w16,w16,w17
1560	sli	v9.4s,v26.4s,#7
1561	add	w13,w13,w19
1562	sli	v13.4s,v27.4s,#7
1563	add	w14,w14,w20
1564	sli	v17.4s,v28.4s,#7
1565	eor	w10,w10,w15
1566	sli	v21.4s,v29.4s,#7
1567	eor	w11,w11,w16
// NEON: rotate rows (c by 8 bytes, d by 12, b by 4) so that the following
// round operates on diagonals. The scalar stream needs no data movement; it
// simply changes which registers it pairs.
1568	ext	v2.16b,v2.16b,v2.16b,#8
1569	eor	w12,w12,w13
1570	ext	v6.16b,v6.16b,v6.16b,#8
1571	eor	w9,w9,w14
1572	ext	v10.16b,v10.16b,v10.16b,#8
1573	ror	w10,w10,#25
1574	ext	v14.16b,v14.16b,v14.16b,#8
1575	ror	w11,w11,#25
1576	ext	v18.16b,v18.16b,v18.16b,#8
1577	ror	w12,w12,#25
1578	ext	v22.16b,v22.16b,v22.16b,#8
1579	ror	w9,w9,#25
1580	ext	v3.16b,v3.16b,v3.16b,#12
1581	ext	v7.16b,v7.16b,v7.16b,#12
1582	ext	v11.16b,v11.16b,v11.16b,#12
1583	ext	v15.16b,v15.16b,v15.16b,#12
1584	ext	v19.16b,v19.16b,v19.16b,#12
1585	ext	v23.16b,v23.16b,v23.16b,#12
1586	ext	v1.16b,v1.16b,v1.16b,#4
1587	ext	v5.16b,v5.16b,v5.16b,#4
1588	ext	v9.16b,v9.16b,v9.16b,#4
1589	ext	v13.16b,v13.16b,v13.16b,#4
1590	ext	v17.16b,v17.16b,v17.16b,#4
1591	ext	v21.16b,v21.16b,v21.16b,#4
// NEON: diagonal round begins. Scalar: 2nd column round begins.
1592	add	v0.4s,v0.4s,v1.4s
1593	add	w5,w5,w9
1594	add	v4.4s,v4.4s,v5.4s
1595	add	w6,w6,w10
1596	add	v8.4s,v8.4s,v9.4s
1597	add	w7,w7,w11
1598	add	v12.4s,v12.4s,v13.4s
1599	add	w8,w8,w12
1600	add	v16.4s,v16.4s,v17.4s
1601	eor	w17,w17,w5
1602	add	v20.4s,v20.4s,v21.4s
1603	eor	w19,w19,w6
1604	eor	v3.16b,v3.16b,v0.16b
1605	eor	w20,w20,w7
1606	eor	v7.16b,v7.16b,v4.16b
1607	eor	w21,w21,w8
1608	eor	v11.16b,v11.16b,v8.16b
1609	ror	w17,w17,#16
1610	eor	v15.16b,v15.16b,v12.16b
1611	ror	w19,w19,#16
1612	eor	v19.16b,v19.16b,v16.16b
1613	ror	w20,w20,#16
1614	eor	v23.16b,v23.16b,v20.16b
1615	ror	w21,w21,#16
1616	rev32	v3.8h,v3.8h
1617	add	w13,w13,w17
1618	rev32	v7.8h,v7.8h
1619	add	w14,w14,w19
1620	rev32	v11.8h,v11.8h
1621	add	w15,w15,w20
1622	rev32	v15.8h,v15.8h
1623	add	w16,w16,w21
1624	rev32	v19.8h,v19.8h
1625	eor	w9,w9,w13
1626	rev32	v23.8h,v23.8h
1627	eor	w10,w10,w14
1628	add	v2.4s,v2.4s,v3.4s
1629	eor	w11,w11,w15
1630	add	v6.4s,v6.4s,v7.4s
1631	eor	w12,w12,w16
1632	add	v10.4s,v10.4s,v11.4s
1633	ror	w9,w9,#20
1634	add	v14.4s,v14.4s,v15.4s
1635	ror	w10,w10,#20
1636	add	v18.4s,v18.4s,v19.4s
1637	ror	w11,w11,#20
1638	add	v22.4s,v22.4s,v23.4s
1639	ror	w12,w12,#20
1640	eor	v24.16b,v1.16b,v2.16b
1641	add	w5,w5,w9
1642	eor	v25.16b,v5.16b,v6.16b
1643	add	w6,w6,w10
1644	eor	v26.16b,v9.16b,v10.16b
1645	add	w7,w7,w11
1646	eor	v27.16b,v13.16b,v14.16b
1647	add	w8,w8,w12
1648	eor	v28.16b,v17.16b,v18.16b
1649	eor	w17,w17,w5
1650	eor	v29.16b,v21.16b,v22.16b
1651	eor	w19,w19,w6
1652	ushr	v1.4s,v24.4s,#20
1653	eor	w20,w20,w7
1654	ushr	v5.4s,v25.4s,#20
1655	eor	w21,w21,w8
1656	ushr	v9.4s,v26.4s,#20
1657	ror	w17,w17,#24
1658	ushr	v13.4s,v27.4s,#20
1659	ror	w19,w19,#24
1660	ushr	v17.4s,v28.4s,#20
1661	ror	w20,w20,#24
1662	ushr	v21.4s,v29.4s,#20
1663	ror	w21,w21,#24
1664	sli	v1.4s,v24.4s,#12
1665	add	w13,w13,w17
1666	sli	v5.4s,v25.4s,#12
1667	add	w14,w14,w19
1668	sli	v9.4s,v26.4s,#12
1669	add	w15,w15,w20
1670	sli	v13.4s,v27.4s,#12
1671	add	w16,w16,w21
1672	sli	v17.4s,v28.4s,#12
1673	eor	w9,w9,w13
1674	sli	v21.4s,v29.4s,#12
1675	eor	w10,w10,w14
1676	add	v0.4s,v0.4s,v1.4s
1677	eor	w11,w11,w15
1678	add	v4.4s,v4.4s,v5.4s
1679	eor	w12,w12,w16
1680	add	v8.4s,v8.4s,v9.4s
1681	ror	w9,w9,#25
1682	add	v12.4s,v12.4s,v13.4s
1683	ror	w10,w10,#25
1684	add	v16.4s,v16.4s,v17.4s
1685	ror	w11,w11,#25
1686	add	v20.4s,v20.4s,v21.4s
1687	ror	w12,w12,#25
// Scalar: 2nd diagonal round begins on the next scalar line. NEON is still
// inside its diagonal round.
1688	eor	v24.16b,v3.16b,v0.16b
1689	add	w5,w5,w10
1690	eor	v25.16b,v7.16b,v4.16b
1691	add	w6,w6,w11
1692	eor	v26.16b,v11.16b,v8.16b
1693	add	w7,w7,w12
1694	eor	v27.16b,v15.16b,v12.16b
1695	add	w8,w8,w9
1696	eor	v28.16b,v19.16b,v16.16b
1697	eor	w21,w21,w5
1698	eor	v29.16b,v23.16b,v20.16b
1699	eor	w17,w17,w6
1700	ushr	v3.4s,v24.4s,#24
1701	eor	w19,w19,w7
1702	ushr	v7.4s,v25.4s,#24
1703	eor	w20,w20,w8
1704	ushr	v11.4s,v26.4s,#24
1705	ror	w21,w21,#16
1706	ushr	v15.4s,v27.4s,#24
1707	ror	w17,w17,#16
1708	ushr	v19.4s,v28.4s,#24
1709	ror	w19,w19,#16
1710	ushr	v23.4s,v29.4s,#24
1711	ror	w20,w20,#16
1712	sli	v3.4s,v24.4s,#8
1713	add	w15,w15,w21
1714	sli	v7.4s,v25.4s,#8
1715	add	w16,w16,w17
1716	sli	v11.4s,v26.4s,#8
1717	add	w13,w13,w19
1718	sli	v15.4s,v27.4s,#8
1719	add	w14,w14,w20
1720	sli	v19.4s,v28.4s,#8
1721	eor	w10,w10,w15
1722	sli	v23.4s,v29.4s,#8
1723	eor	w11,w11,w16
1724	add	v2.4s,v2.4s,v3.4s
1725	eor	w12,w12,w13
1726	add	v6.4s,v6.4s,v7.4s
1727	eor	w9,w9,w14
1728	add	v10.4s,v10.4s,v11.4s
1729	ror	w10,w10,#20
1730	add	v14.4s,v14.4s,v15.4s
1731	ror	w11,w11,#20
1732	add	v18.4s,v18.4s,v19.4s
1733	ror	w12,w12,#20
1734	add	v22.4s,v22.4s,v23.4s
1735	ror	w9,w9,#20
1736	eor	v24.16b,v1.16b,v2.16b
1737	add	w5,w5,w10
1738	eor	v25.16b,v5.16b,v6.16b
1739	add	w6,w6,w11
1740	eor	v26.16b,v9.16b,v10.16b
1741	add	w7,w7,w12
1742	eor	v27.16b,v13.16b,v14.16b
1743	add	w8,w8,w9
1744	eor	v28.16b,v17.16b,v18.16b
1745	eor	w21,w21,w5
1746	eor	v29.16b,v21.16b,v22.16b
1747	eor	w17,w17,w6
1748	ushr	v1.4s,v24.4s,#25
1749	eor	w19,w19,w7
1750	ushr	v5.4s,v25.4s,#25
1751	eor	w20,w20,w8
1752	ushr	v9.4s,v26.4s,#25
1753	ror	w21,w21,#24
1754	ushr	v13.4s,v27.4s,#25
1755	ror	w17,w17,#24
1756	ushr	v17.4s,v28.4s,#25
1757	ror	w19,w19,#24
1758	ushr	v21.4s,v29.4s,#25
1759	ror	w20,w20,#24
1760	sli	v1.4s,v24.4s,#7
1761	add	w15,w15,w21
1762	sli	v5.4s,v25.4s,#7
1763	add	w16,w16,w17
1764	sli	v9.4s,v26.4s,#7
1765	add	w13,w13,w19
1766	sli	v13.4s,v27.4s,#7
1767	add	w14,w14,w20
1768	sli	v17.4s,v28.4s,#7
1769	eor	w10,w10,w15
1770	sli	v21.4s,v29.4s,#7
1771	eor	w11,w11,w16
// NEON: rotate rows back (c by 8 bytes, d by 4, b by 12), undoing the
// earlier ext rotation so the next iteration starts in column form.
1772	ext	v2.16b,v2.16b,v2.16b,#8
1773	eor	w12,w12,w13
1774	ext	v6.16b,v6.16b,v6.16b,#8
1775	eor	w9,w9,w14
1776	ext	v10.16b,v10.16b,v10.16b,#8
1777	ror	w10,w10,#25
1778	ext	v14.16b,v14.16b,v14.16b,#8
1779	ror	w11,w11,#25
1780	ext	v18.16b,v18.16b,v18.16b,#8
1781	ror	w12,w12,#25
1782	ext	v22.16b,v22.16b,v22.16b,#8
1783	ror	w9,w9,#25
1784	ext	v3.16b,v3.16b,v3.16b,#4
1785	ext	v7.16b,v7.16b,v7.16b,#4
1786	ext	v11.16b,v11.16b,v11.16b,#4
1787	ext	v15.16b,v15.16b,v15.16b,#4
1788	ext	v19.16b,v19.16b,v19.16b,#4
1789	ext	v23.16b,v23.16b,v23.16b,#4
1790	ext	v1.16b,v1.16b,v1.16b,#12
1791	ext	v5.16b,v5.16b,v5.16b,#12
1792	ext	v9.16b,v9.16b,v9.16b,#12
1793	ext	v13.16b,v13.16b,v13.16b,#12
1794	ext	v17.16b,v17.16b,v17.16b,#12
1795	ext	v21.16b,v21.16b,v21.16b,#12
// Loop until the iteration counter reaches zero (cbnz does not use flags).
1796	cbnz	x4,Loop_lower_neon
1797
// Fall-through after Loop_lower_neon: finalise and emit the remaining seven
// 64-byte blocks of this 512-byte chunk (one scalar block in w5-w21, six
// NEON blocks in v0-v23). Scalar and NEON instructions stay interleaved.
//
// Scalar: add the saved state in x22-x28/x30 to the working words, as for the
// first scalar block.
// NEON: v24-v29 are reloaded from the 96-byte off-load area at [sp,#0] and
// added row-wise: v24 to every row a, v25 to every row b, v26 to every row c.
// Row d gets a per-block value: v27, v28, v29, v30 for the first four blocks,
// and v27+v31, v28+v31 for the last two (v31 is the "+4" constant per the
// existing comments).
1798	add	w5,w5,w22		// accumulate key block
1799	ldp	q24,q25,[sp,#0]
1800	add	x6,x6,x22,lsr#32
1801	ldp	q26,q27,[sp,#32]
1802	add	w7,w7,w23
1803	ldp	q28,q29,[sp,#64]
1804	add	x8,x8,x23,lsr#32
1805	add	v0.4s,v0.4s,v24.4s
1806	add	w9,w9,w24
1807	add	v4.4s,v4.4s,v24.4s
1808	add	x10,x10,x24,lsr#32
1809	add	v8.4s,v8.4s,v24.4s
1810	add	w11,w11,w25
1811	add	v12.4s,v12.4s,v24.4s
1812	add	x12,x12,x25,lsr#32
1813	add	v16.4s,v16.4s,v24.4s
1814	add	w13,w13,w26
1815	add	v20.4s,v20.4s,v24.4s
1816	add	x14,x14,x26,lsr#32
1817	add	v2.4s,v2.4s,v26.4s
1818	add	w15,w15,w27
1819	add	v6.4s,v6.4s,v26.4s
1820	add	x16,x16,x27,lsr#32
1821	add	v10.4s,v10.4s,v26.4s
1822	add	w17,w17,w28
1823	add	v14.4s,v14.4s,v26.4s
1824	add	x19,x19,x28,lsr#32
1825	add	v18.4s,v18.4s,v26.4s
1826	add	w20,w20,w30
1827	add	v22.4s,v22.4s,v26.4s
1828	add	x21,x21,x30,lsr#32
// Scalar: pack word pairs into 64-bit registers and load 64 input bytes from
// x1 into the freed partner registers. The carry out of bit 31 in the x-form
// adds above is discarded by the lsl#32.
1829	add	v19.4s,v19.4s,v31.4s			// +4
1830	add	x5,x5,x6,lsl#32	// pack
1831	add	v23.4s,v23.4s,v31.4s			// +4
1832	add	x7,x7,x8,lsl#32
1833	add	v3.4s,v3.4s,v27.4s
1834	ldp	x6,x8,[x1,#0]		// load input
1835	add	v7.4s,v7.4s,v28.4s
1836	add	x9,x9,x10,lsl#32
1837	add	v11.4s,v11.4s,v29.4s
1838	add	x11,x11,x12,lsl#32
1839	add	v15.4s,v15.4s,v30.4s
1840	ldp	x10,x12,[x1,#16]
1841	add	v19.4s,v19.4s,v27.4s
1842	add	x13,x13,x14,lsl#32
1843	add	v23.4s,v23.4s,v28.4s
1844	add	x15,x15,x16,lsl#32
1845	add	v1.4s,v1.4s,v25.4s
1846	ldp	x14,x16,[x1,#32]
1847	add	v5.4s,v5.4s,v25.4s
1848	add	x17,x17,x19,lsl#32
1849	add	v9.4s,v9.4s,v25.4s
1850	add	x20,x20,x21,lsl#32
1851	add	v13.4s,v13.4s,v25.4s
1852	ldp	x19,x21,[x1,#48]
1853	add	v17.4s,v17.4s,v25.4s
1854	add	x1,x1,#64
1855	add	v21.4s,v21.4s,v25.4s
1856
// Big-endian builds only: byte-reverse the scalar key stream. No rev is
// applied to the NEON registers in this section.
1857#ifdef	__AARCH64EB__
1858	rev	x5,x5
1859	rev	x7,x7
1860	rev	x9,x9
1861	rev	x11,x11
1862	rev	x13,x13
1863	rev	x15,x15
1864	rev	x17,x17
1865	rev	x20,x20
1866#endif
// XOR with input and store. From here on v24-v27, and then each already
// stored group of four state registers, are recycled as input buffers for
// the following 64-byte block, forming a software pipeline of
// load / eor / store with post-incremented x1 and x0.
1867	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1868	eor	x5,x5,x6
1869	eor	x7,x7,x8
1870	eor	x9,x9,x10
1871	eor	x11,x11,x12
1872	eor	x13,x13,x14
1873	eor	v0.16b,v0.16b,v24.16b
1874	eor	x15,x15,x16
1875	eor	v1.16b,v1.16b,v25.16b
1876	eor	x17,x17,x19
1877	eor	v2.16b,v2.16b,v26.16b
1878	eor	x20,x20,x21
1879	eor	v3.16b,v3.16b,v27.16b
1880	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1881
// The first scalar block already advanced x28 by 1, so +7 here makes the
// total advance 8 per chunk, matching the eight 64-byte blocks emitted.
1882	stp	x5,x7,[x0,#0]		// store output
1883	add	x28,x28,#7			// increment counter
1884	stp	x9,x11,[x0,#16]
1885	stp	x13,x15,[x0,#32]
1886	stp	x17,x20,[x0,#48]
1887	add	x0,x0,#64
1888	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1889
1890	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1891	eor	v4.16b,v4.16b,v24.16b
1892	eor	v5.16b,v5.16b,v25.16b
1893	eor	v6.16b,v6.16b,v26.16b
1894	eor	v7.16b,v7.16b,v27.16b
1895	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1896
// v24-v27 are no longer needed as input buffers, so they are restored from
// the off-load area for the next outer iteration (v28/v29 were not clobbered).
1897	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1898	eor	v8.16b,v8.16b,v0.16b
1899	ldp	q24,q25,[sp,#0]
1900	eor	v9.16b,v9.16b,v1.16b
1901	ldp	q26,q27,[sp,#32]
1902	eor	v10.16b,v10.16b,v2.16b
1903	eor	v11.16b,v11.16b,v3.16b
1904	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1905
1906	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1907	eor	v12.16b,v12.16b,v4.16b
1908	eor	v13.16b,v13.16b,v5.16b
1909	eor	v14.16b,v14.16b,v6.16b
1910	eor	v15.16b,v15.16b,v7.16b
1911	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1912
1913	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1914	eor	v16.16b,v16.16b,v8.16b
1915	eor	v17.16b,v17.16b,v9.16b
1916	eor	v18.16b,v18.16b,v10.16b
1917	eor	v19.16b,v19.16b,v11.16b
1918	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1919
// v0 is free again; use it for the counter step of the vector row-d values.
1920	shl	v0.4s,v31.4s,#1			// 4 -> 8
1921	eor	v20.16b,v20.16b,v12.16b
1922	eor	v21.16b,v21.16b,v13.16b
1923	eor	v22.16b,v22.16b,v14.16b
1924	eor	v23.16b,v23.16b,v15.16b
1925	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1926
1927	add	v27.4s,v27.4s,v0.4s			// += 8
1928	add	v28.4s,v28.4s,v0.4s
1929	add	v29.4s,v29.4s,v0.4s
1930	add	v30.4s,v30.4s,v0.4s
1931
// NOTE(review): no instruction in the visible part of the loop body writes
// the condition flags, so this branch consumes flags set earlier in the outer
// loop, outside this excerpt (presumably a subs on the remaining length in
// x2 - confirm against the loop head).
1932	b.hs	Loop_outer_512_neon
1933
// Fewer than 512 bytes remain. Adding 512 back presumably undoes the earlier
// subtraction so x2 is the true remaining length; the Z flag set here is
// consumed by the b.eq below (ldp/stp/ushr do not alter flags).
1934	adds	x2,x2,#512
1935	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1936
// Restore d8-d15 (low halves of v8-v15, callee-saved on AArch64) from
// [sp,#128..#191], where they were presumably saved on entry.
1937	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1938	ldp	d10,d11,[sp,#128+16]
1939	ldp	d12,d13,[sp,#128+32]
1940	ldp	d14,d15,[sp,#128+48]
1941
// Overwrite the 96-byte off-load area at [sp,#0] with the current contents of
// v24 and v31 so the values previously stored there do not linger on the stack.
1942	stp	q24,q31,[sp,#0]		// wipe off-load area
1943	stp	q24,q31,[sp,#32]
1944	stp	q24,q31,[sp,#64]
1945
1946	b.eq	Ldone_512_neon
1947
// Remainder handling: at least 192 bytes left -> Loop_outer_neon, otherwise
// the scalar Loop_outer. The flags from cmp survive to b.hs because the
// vector subs and the non-flag-setting add on sp do not modify them.
// v27-v29 had been advanced by 8 above; stepping them back by 1 here is
// presumably what Loop_outer_neon expects on entry (TODO confirm there).
// The 128 bytes released from sp cover the off-load area.
1947	cmp	x2,#192
1948	sub	v27.4s,v27.4s,v0.4s			// -= 1
1949	sub	v28.4s,v28.4s,v0.4s
1950	sub	v29.4s,v29.4s,v0.4s
1951	add	sp,sp,#128
1952	b.hs	Loop_outer_neon
1953
// Scalar fallback: zero v25-v30 before leaving the NEON path (presumably to
// scrub key and counter material; v24 and v31 are left as they are).
1954	eor	v25.16b,v25.16b,v25.16b
1955	eor	v26.16b,v26.16b,v26.16b
1956	eor	v27.16b,v27.16b,v27.16b
1957	eor	v28.16b,v28.16b,v28.16b
1958	eor	v29.16b,v29.16b,v29.16b
1959	eor	v30.16b,v30.16b,v30.16b
1960	b	Loop_outer
1961
1962
// Ldone_512_neon: epilogue, reached when no input bytes remain.
// Restores x19-x28 from the frame addressed by x29, releases the 128+64 bytes
// of local stack still allocated at this point, then pops x29/x30 together
// with the 96-byte register save frame and returns.
// AARCH64_VALIDATE_LINK_REGISTER is a macro defined outside this file
// (presumably the pointer-authentication counterpart of the
// AARCH64_SIGN_LINK_REGISTER used in function prologues - confirm in
// arm_arch.h).
1963Ldone_512_neon:
1964	ldp	x19,x20,[x29,#16]
1965	add	sp,sp,#128+64
1966	ldp	x21,x22,[x29,#32]
1967	ldp	x23,x24,[x29,#48]
1968	ldp	x25,x26,[x29,#64]
1969	ldp	x27,x28,[x29,#80]
1970	ldp	x29,x30,[sp],#96
1971	AARCH64_VALIDATE_LINK_REGISTER
1972	ret
1973
1974#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
1975