1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

// Read-only constant pool shared by all ChaCha20 code paths below.
.section	.rodata
.align	64
// Four zero dword lanes.
.Lzero:
.long	0,0,0,0
// Counter increment of 1 (low lane only) for the one-block paths.
.Lone:
.long	1,0,0,0
// Per-lane block-counter offsets 0..3 for the 4-way path.
.Linc:
.long	0,1,2,3
// Counter step of 4 per 256-byte iteration of the 4-way path.
.Lfour:
.long	4,4,4,4
// Per-lane counter offsets for the 8-way (AVX2) path; the lane order
// matches that path's data layout.
.Lincy:
.long	0,2,4,6,1,3,5,7
// Counter step of 8 per 512-byte iteration of the 8-way path.
.Leight:
.long	8,8,8,8,8,8,8,8
// pshufb masks rotating each 32-bit lane left by 16 bits / 24 bits.
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// The ChaCha input constant: "expand 32-byte k" (NUL-terminated).
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align	64
// NOTE(review): the .L*z and .Lsixteen tables are not referenced by any
// routine visible in this excerpt — presumably for an AVX-512 path
// elsewhere in the generated file; confirm before removing.
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// Identification string:
// "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
// ChaCha20_ctr32 — ChaCha20 stream cipher, 32-bit block counter.
//
// Register arguments (inferred from use; presumably the C prototype is
// void ChaCha20_ctr32(u8 *out, const u8 *in, size_t len,
//                     const u32 key[8], const u32 counter[4]) — confirm):
//   rdi = out, rsi = in, rdx = len (bytes),
//   rcx -> 32-byte key, r8 -> 16-byte counter||nonce block.
//
// This entry point is the scalar (integer-register) fallback. It first
// dispatches to the SSSE3 path when bit 9 ($512) of the second dword of
// OPENSSL_ia32cap_P is set.
.globl	ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
_CET_ENDBR
	// len == 0: nothing to do.
	cmpq	$0,%rdx
	je	.Lno_data
	// CPU-feature dispatch; r10 keeps dwords 1-2 of OPENSSL_ia32cap_P
	// and is consumed again by ChaCha20_4x for its own dispatch.
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	// Save all callee-saved GPRs; the scalar rounds use every register.
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	r15,-56
	// 64 bytes of state/keystream scratch + 24 bytes of saved len/in/out.
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	88
.Lctr32_body:


	// Load key (xmm1:xmm2 = state words 4..11) and counter||nonce
	// (xmm3 = state words 12..15); xmm4 = counter increment.
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	// Stack state image: 16..28(%rsp) = words 4..7, 32..44 = words 8..11,
	// 48..60 = words 12..15.
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	// Reload working state for one 64-byte block:
	// eax..edx = sigma constants (words 0..3), r8d..r11d = words 4..7,
	// r12d..r15d = words 12..15; words 8..11 rotate through esi/edi and
	// the 32..44(%rsp) slots during the rounds.
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	// Spill len/in/out; ebp becomes the round-pair counter (10 double
	// rounds = 20 rounds).
	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
	// movq %xmm2,%rsi (hand-encoded): esi = state word 8, and after the
	// shift below edi = state word 9.
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

.align	32
.Loop:
	// One double round: two column quarter-rounds, swap esi/edi to the
	// other pair of third-row words, two more columns, then the four
	// diagonal quarter-rounds with the same swap trick.
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	// Park words 8/9, pick up words 10/11.
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	// Diagonal rounds (first pair uses words 10/11 already in esi/edi).
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	// Park words 10/11, pick words 8/9 back up.
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	// Rounds done: park words 8/9, restore len/in/out, and advance the
	// xmm copy of the counter block (xmm3 += .Lone).
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	// Feed-forward: add the original input state back into the working
	// state (xmm1 covers words 8..11 via the 32(%rsp) image).
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	// Partial final block goes through the byte-wise tail path.
	cmpq	$64,%rbp
	jb	.Ltail

	// Full block: XOR 64 bytes of input with the keystream and store.
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	// Refresh the stack image of words 8..11 and the updated counter.
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	// Write the full 64-byte keystream block to the stack, then XOR it
	// into the output one byte at a time (rbp = remaining bytes < 64).
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	// Epilogue: restore callee-saved registers and unwind the 88-byte
	// frame plus the six pushes (136 bytes total, see CFA adjustment).
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
.cfi_restore	r15
	movq	-40(%rsi),%r14
.cfi_restore	r14
	movq	-32(%rsi),%r13
.cfi_restore	r13
	movq	-24(%rsi),%r12
.cfi_restore	r12
	movq	-16(%rsi),%rbp
.cfi_restore	rbp
	movq	-8(%rsi),%rbx
.cfi_restore	rbx
	leaq	(%rsi),%rsp
.cfi_adjust_cfa_offset	-136
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
// ChaCha20_ssse3 — one-block SSSE3 path, used for inputs of at most
// 128 bytes (larger inputs are forwarded to the 4-way path below).
// Same register arguments as ChaCha20_ctr32. Keeps the whole 4x4 state
// in xmm0..xmm3 (one row per register) and rotates by 16/24 with pshufb
// through the .Lrot16/.Lrot24 masks held in xmm6/xmm7.
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
	// Frameless: r9 remembers the caller's rsp (also relied upon by the
	// 4-way/8-way paths that share this epilogue convention).
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	cmpq	$128,%rdx
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	// 0..63(%rsp) holds the input state for the feed-forward addition.
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	// Next 64-byte block: reload the saved state and bump the counter.
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	// One double round; r8 counts 10 iterations (20 rounds). Rotation by
	// 16/24 uses pshufb, rotation by 12/7 uses shift+or via xmm4.
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// pshufb %xmm6,%xmm3 (rotate lanes left 16)
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// pshufb %xmm7,%xmm3 (rotate lanes left 8 via rot24 mask)
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Rotate rows to diagonal form for the second half of the round.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Undo the diagonalization.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	// Feed-forward: add the saved input state back in.
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	// Full 64-byte block: XOR with input and store.
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	// Partial block: dump the keystream to the stack and XOR byte-wise
	// (rdx = remaining bytes < 64, r8 = byte index).
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	// Restore the caller's stack pointer saved in r9 on entry.
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
// ChaCha20_4x — 4-way interleaved SSSE3 path, 256 bytes per outer
// iteration. The four blocks are kept in transposed (structure-of-
// arrays) form: each xmm register holds one state word across all four
// blocks, broadcast via pshufd and spilled to 64..112(%rsp) and the
// rcx-relative save area. On entry r10 still holds dwords 1-2 of
// OPENSSL_ia32cap_P (loaded in ChaCha20_ctr32) and r9 is not yet set.
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	// Dispatch: bit 5 ($32) of ia32cap dword 2 selects the 8-way AVX2
	// path for any length.
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	// NOTE(review): mask 71303168 = 0x04400000, compare 4194304 =
	// 0x00400000 — a feature-bit heuristic on ia32cap dword 1 that sends
	// short (<=192B) inputs on matching CPUs back to the one-block SSSE3
	// path; presumably an Atom-class detection — confirm against the
	// perlasm source.
	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	// 0x140 = 320 bytes of frame: 0..63 scratch, 64..127 broadcast sigma
	// rows, then (via rcx = rsp+256) the broadcast key/counter state.
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	// Broadcast each sigma word into its own register (xmm8..xmm11 =
	// state words 0..3 across the four lanes) and save copies.
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	// Broadcast key words 0..3 (state words 4..7).
	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	// Broadcast key words 4..7 (state words 8..11).
	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	// Broadcast counter/nonce (state words 12..15); lane counters get
	// per-block offsets 0..3 from .Linc.
	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	// Reload the full broadcast state and advance all four lane
	// counters by 4.
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	// Only 16 xmm registers for 16 state words plus scratch: words
	// 10/11 (xmm6/xmm7) wait in 32/48(%rsp) while xmm6/xmm7 serve as
	// rotate temporaries / pshufb masks.
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	// Double round over all four lanes; quarter-rounds are processed two
	// columns at a time, swapping the active word-8..11 pair through
	// 0..48(%rsp). The .byte sequences are hand-encoded pshufb with the
	// rot16 mask (from (%r10)) or rot24 mask (from (%r11)).
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	// Swap word pair 8/9 out, 10/11 in.
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	// Diagonal quarter-rounds, same two-at-a-time pattern.
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	// Swap word pair 10/11 out, 8/9 back in.
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	// Feed-forward rows 0..3, then transpose each 4x4 group of words
	// from SoA back to per-block order with punpck*/punpck*qdq.
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	// Feed-forward + transpose words 4..7.
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	// Feed-forward + transpose words 8..11.
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	// Feed-forward + transpose words 12..15.
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	// Full 256 bytes: XOR/store four 64-byte blocks, interleaving loads
	// and stores to hide latency.
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	// rdx < 256 remaining: handle as many whole 64-byte blocks as fit,
	// then fall through to the byte-wise loop for the final partial one.
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	// < 64 bytes: stage block 0's keystream at 0..63(%rsp) and XOR
	// byte-wise (0(%rsp) already holds the first 16 bytes).
	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	// Stage block 1's keystream for the byte-wise tail.
	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	// Stage block 2's keystream for the byte-wise tail.
	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	// Stage block 3's keystream for the byte-wise tail.
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	// Byte-wise XOR of the staged keystream at (%rsp); r10 = byte index,
	// rdx = remaining bytes.
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	// Restore the caller's rsp saved in r9 at entry.
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
1022.type	ChaCha20_8x,@function
1023.align	32
1024ChaCha20_8x:
1025.LChaCha20_8x:
1026.cfi_startproc
1027	movq	%rsp,%r9
1028.cfi_def_cfa_register	r9
1029	subq	$0x280+8,%rsp
1030	andq	$-32,%rsp
1031	vzeroupper
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042	vbroadcasti128	.Lsigma(%rip),%ymm11
1043	vbroadcasti128	(%rcx),%ymm3
1044	vbroadcasti128	16(%rcx),%ymm15
1045	vbroadcasti128	(%r8),%ymm7
1046	leaq	256(%rsp),%rcx
1047	leaq	512(%rsp),%rax
1048	leaq	.Lrot16(%rip),%r10
1049	leaq	.Lrot24(%rip),%r11
1050
1051	vpshufd	$0x00,%ymm11,%ymm8
1052	vpshufd	$0x55,%ymm11,%ymm9
1053	vmovdqa	%ymm8,128-256(%rcx)
1054	vpshufd	$0xaa,%ymm11,%ymm10
1055	vmovdqa	%ymm9,160-256(%rcx)
1056	vpshufd	$0xff,%ymm11,%ymm11
1057	vmovdqa	%ymm10,192-256(%rcx)
1058	vmovdqa	%ymm11,224-256(%rcx)
1059
1060	vpshufd	$0x00,%ymm3,%ymm0
1061	vpshufd	$0x55,%ymm3,%ymm1
1062	vmovdqa	%ymm0,256-256(%rcx)
1063	vpshufd	$0xaa,%ymm3,%ymm2
1064	vmovdqa	%ymm1,288-256(%rcx)
1065	vpshufd	$0xff,%ymm3,%ymm3
1066	vmovdqa	%ymm2,320-256(%rcx)
1067	vmovdqa	%ymm3,352-256(%rcx)
1068
1069	vpshufd	$0x00,%ymm15,%ymm12
1070	vpshufd	$0x55,%ymm15,%ymm13
1071	vmovdqa	%ymm12,384-512(%rax)
1072	vpshufd	$0xaa,%ymm15,%ymm14
1073	vmovdqa	%ymm13,416-512(%rax)
1074	vpshufd	$0xff,%ymm15,%ymm15
1075	vmovdqa	%ymm14,448-512(%rax)
1076	vmovdqa	%ymm15,480-512(%rax)
1077
1078	vpshufd	$0x00,%ymm7,%ymm4
1079	vpshufd	$0x55,%ymm7,%ymm5
1080	vpaddd	.Lincy(%rip),%ymm4,%ymm4
1081	vpshufd	$0xaa,%ymm7,%ymm6
1082	vmovdqa	%ymm5,544-512(%rax)
1083	vpshufd	$0xff,%ymm7,%ymm7
1084	vmovdqa	%ymm6,576-512(%rax)
1085	vmovdqa	%ymm7,608-512(%rax)
1086
1087	jmp	.Loop_enter8x
1088
1089.align	32
1090.Loop_outer8x:
1091	vmovdqa	128-256(%rcx),%ymm8
1092	vmovdqa	160-256(%rcx),%ymm9
1093	vmovdqa	192-256(%rcx),%ymm10
1094	vmovdqa	224-256(%rcx),%ymm11
1095	vmovdqa	256-256(%rcx),%ymm0
1096	vmovdqa	288-256(%rcx),%ymm1
1097	vmovdqa	320-256(%rcx),%ymm2
1098	vmovdqa	352-256(%rcx),%ymm3
1099	vmovdqa	384-512(%rax),%ymm12
1100	vmovdqa	416-512(%rax),%ymm13
1101	vmovdqa	448-512(%rax),%ymm14
1102	vmovdqa	480-512(%rax),%ymm15
1103	vmovdqa	512-512(%rax),%ymm4
1104	vmovdqa	544-512(%rax),%ymm5
1105	vmovdqa	576-512(%rax),%ymm6
1106	vmovdqa	608-512(%rax),%ymm7
1107	vpaddd	.Leight(%rip),%ymm4,%ymm4
1108
1109.Loop_enter8x:
1110	vmovdqa	%ymm14,64(%rsp)
1111	vmovdqa	%ymm15,96(%rsp)
1112	vbroadcasti128	(%r10),%ymm15
1113	vmovdqa	%ymm4,512-512(%rax)
1114	movl	$10,%eax
1115	jmp	.Loop8x
1116
1117.align	32
1118.Loop8x:
1119	vpaddd	%ymm0,%ymm8,%ymm8
1120	vpxor	%ymm4,%ymm8,%ymm4
1121	vpshufb	%ymm15,%ymm4,%ymm4
1122	vpaddd	%ymm1,%ymm9,%ymm9
1123	vpxor	%ymm5,%ymm9,%ymm5
1124	vpshufb	%ymm15,%ymm5,%ymm5
1125	vpaddd	%ymm4,%ymm12,%ymm12
1126	vpxor	%ymm0,%ymm12,%ymm0
1127	vpslld	$12,%ymm0,%ymm14
1128	vpsrld	$20,%ymm0,%ymm0
1129	vpor	%ymm0,%ymm14,%ymm0
1130	vbroadcasti128	(%r11),%ymm14
1131	vpaddd	%ymm5,%ymm13,%ymm13
1132	vpxor	%ymm1,%ymm13,%ymm1
1133	vpslld	$12,%ymm1,%ymm15
1134	vpsrld	$20,%ymm1,%ymm1
1135	vpor	%ymm1,%ymm15,%ymm1
1136	vpaddd	%ymm0,%ymm8,%ymm8
1137	vpxor	%ymm4,%ymm8,%ymm4
1138	vpshufb	%ymm14,%ymm4,%ymm4
1139	vpaddd	%ymm1,%ymm9,%ymm9
1140	vpxor	%ymm5,%ymm9,%ymm5
1141	vpshufb	%ymm14,%ymm5,%ymm5
1142	vpaddd	%ymm4,%ymm12,%ymm12
1143	vpxor	%ymm0,%ymm12,%ymm0
1144	vpslld	$7,%ymm0,%ymm15
1145	vpsrld	$25,%ymm0,%ymm0
1146	vpor	%ymm0,%ymm15,%ymm0
1147	vbroadcasti128	(%r10),%ymm15
1148	vpaddd	%ymm5,%ymm13,%ymm13
1149	vpxor	%ymm1,%ymm13,%ymm1
1150	vpslld	$7,%ymm1,%ymm14
1151	vpsrld	$25,%ymm1,%ymm1
1152	vpor	%ymm1,%ymm14,%ymm1
1153	vmovdqa	%ymm12,0(%rsp)
1154	vmovdqa	%ymm13,32(%rsp)
1155	vmovdqa	64(%rsp),%ymm12
1156	vmovdqa	96(%rsp),%ymm13
1157	vpaddd	%ymm2,%ymm10,%ymm10
1158	vpxor	%ymm6,%ymm10,%ymm6
1159	vpshufb	%ymm15,%ymm6,%ymm6
1160	vpaddd	%ymm3,%ymm11,%ymm11
1161	vpxor	%ymm7,%ymm11,%ymm7
1162	vpshufb	%ymm15,%ymm7,%ymm7
1163	vpaddd	%ymm6,%ymm12,%ymm12
1164	vpxor	%ymm2,%ymm12,%ymm2
1165	vpslld	$12,%ymm2,%ymm14
1166	vpsrld	$20,%ymm2,%ymm2
1167	vpor	%ymm2,%ymm14,%ymm2
1168	vbroadcasti128	(%r11),%ymm14
1169	vpaddd	%ymm7,%ymm13,%ymm13
1170	vpxor	%ymm3,%ymm13,%ymm3
1171	vpslld	$12,%ymm3,%ymm15
1172	vpsrld	$20,%ymm3,%ymm3
1173	vpor	%ymm3,%ymm15,%ymm3
1174	vpaddd	%ymm2,%ymm10,%ymm10
1175	vpxor	%ymm6,%ymm10,%ymm6
1176	vpshufb	%ymm14,%ymm6,%ymm6
1177	vpaddd	%ymm3,%ymm11,%ymm11
1178	vpxor	%ymm7,%ymm11,%ymm7
1179	vpshufb	%ymm14,%ymm7,%ymm7
1180	vpaddd	%ymm6,%ymm12,%ymm12
1181	vpxor	%ymm2,%ymm12,%ymm2
1182	vpslld	$7,%ymm2,%ymm15
1183	vpsrld	$25,%ymm2,%ymm2
1184	vpor	%ymm2,%ymm15,%ymm2
1185	vbroadcasti128	(%r10),%ymm15
1186	vpaddd	%ymm7,%ymm13,%ymm13
1187	vpxor	%ymm3,%ymm13,%ymm3
1188	vpslld	$7,%ymm3,%ymm14
1189	vpsrld	$25,%ymm3,%ymm3
1190	vpor	%ymm3,%ymm14,%ymm3
1191	vpaddd	%ymm1,%ymm8,%ymm8
1192	vpxor	%ymm7,%ymm8,%ymm7
1193	vpshufb	%ymm15,%ymm7,%ymm7
1194	vpaddd	%ymm2,%ymm9,%ymm9
1195	vpxor	%ymm4,%ymm9,%ymm4
1196	vpshufb	%ymm15,%ymm4,%ymm4
1197	vpaddd	%ymm7,%ymm12,%ymm12
1198	vpxor	%ymm1,%ymm12,%ymm1
1199	vpslld	$12,%ymm1,%ymm14
1200	vpsrld	$20,%ymm1,%ymm1
1201	vpor	%ymm1,%ymm14,%ymm1
1202	vbroadcasti128	(%r11),%ymm14
1203	vpaddd	%ymm4,%ymm13,%ymm13
1204	vpxor	%ymm2,%ymm13,%ymm2
1205	vpslld	$12,%ymm2,%ymm15
1206	vpsrld	$20,%ymm2,%ymm2
1207	vpor	%ymm2,%ymm15,%ymm2
1208	vpaddd	%ymm1,%ymm8,%ymm8
1209	vpxor	%ymm7,%ymm8,%ymm7
1210	vpshufb	%ymm14,%ymm7,%ymm7
1211	vpaddd	%ymm2,%ymm9,%ymm9
1212	vpxor	%ymm4,%ymm9,%ymm4
1213	vpshufb	%ymm14,%ymm4,%ymm4
1214	vpaddd	%ymm7,%ymm12,%ymm12
1215	vpxor	%ymm1,%ymm12,%ymm1
1216	vpslld	$7,%ymm1,%ymm15
1217	vpsrld	$25,%ymm1,%ymm1
1218	vpor	%ymm1,%ymm15,%ymm1
1219	vbroadcasti128	(%r10),%ymm15
1220	vpaddd	%ymm4,%ymm13,%ymm13
1221	vpxor	%ymm2,%ymm13,%ymm2
1222	vpslld	$7,%ymm2,%ymm14
1223	vpsrld	$25,%ymm2,%ymm2
1224	vpor	%ymm2,%ymm14,%ymm2
1225	vmovdqa	%ymm12,64(%rsp)
1226	vmovdqa	%ymm13,96(%rsp)
1227	vmovdqa	0(%rsp),%ymm12
1228	vmovdqa	32(%rsp),%ymm13
1229	vpaddd	%ymm3,%ymm10,%ymm10
1230	vpxor	%ymm5,%ymm10,%ymm5
1231	vpshufb	%ymm15,%ymm5,%ymm5
1232	vpaddd	%ymm0,%ymm11,%ymm11
1233	vpxor	%ymm6,%ymm11,%ymm6
1234	vpshufb	%ymm15,%ymm6,%ymm6
1235	vpaddd	%ymm5,%ymm12,%ymm12
1236	vpxor	%ymm3,%ymm12,%ymm3
1237	vpslld	$12,%ymm3,%ymm14
1238	vpsrld	$20,%ymm3,%ymm3
1239	vpor	%ymm3,%ymm14,%ymm3
1240	vbroadcasti128	(%r11),%ymm14
1241	vpaddd	%ymm6,%ymm13,%ymm13
1242	vpxor	%ymm0,%ymm13,%ymm0
1243	vpslld	$12,%ymm0,%ymm15
1244	vpsrld	$20,%ymm0,%ymm0
1245	vpor	%ymm0,%ymm15,%ymm0
1246	vpaddd	%ymm3,%ymm10,%ymm10
1247	vpxor	%ymm5,%ymm10,%ymm5
1248	vpshufb	%ymm14,%ymm5,%ymm5
1249	vpaddd	%ymm0,%ymm11,%ymm11
1250	vpxor	%ymm6,%ymm11,%ymm6
1251	vpshufb	%ymm14,%ymm6,%ymm6
1252	vpaddd	%ymm5,%ymm12,%ymm12
1253	vpxor	%ymm3,%ymm12,%ymm3
1254	vpslld	$7,%ymm3,%ymm15
1255	vpsrld	$25,%ymm3,%ymm3
1256	vpor	%ymm3,%ymm15,%ymm3
1257	vbroadcasti128	(%r10),%ymm15
1258	vpaddd	%ymm6,%ymm13,%ymm13
1259	vpxor	%ymm0,%ymm13,%ymm0
1260	vpslld	$7,%ymm0,%ymm14
1261	vpsrld	$25,%ymm0,%ymm0
1262	vpor	%ymm0,%ymm14,%ymm0
1263	decl	%eax
1264	jnz	.Loop8x
1265
1266	leaq	512(%rsp),%rax
1267	vpaddd	128-256(%rcx),%ymm8,%ymm8
1268	vpaddd	160-256(%rcx),%ymm9,%ymm9
1269	vpaddd	192-256(%rcx),%ymm10,%ymm10
1270	vpaddd	224-256(%rcx),%ymm11,%ymm11
1271
1272	vpunpckldq	%ymm9,%ymm8,%ymm14
1273	vpunpckldq	%ymm11,%ymm10,%ymm15
1274	vpunpckhdq	%ymm9,%ymm8,%ymm8
1275	vpunpckhdq	%ymm11,%ymm10,%ymm10
1276	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1277	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1278	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1279	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1280	vpaddd	256-256(%rcx),%ymm0,%ymm0
1281	vpaddd	288-256(%rcx),%ymm1,%ymm1
1282	vpaddd	320-256(%rcx),%ymm2,%ymm2
1283	vpaddd	352-256(%rcx),%ymm3,%ymm3
1284
1285	vpunpckldq	%ymm1,%ymm0,%ymm10
1286	vpunpckldq	%ymm3,%ymm2,%ymm15
1287	vpunpckhdq	%ymm1,%ymm0,%ymm0
1288	vpunpckhdq	%ymm3,%ymm2,%ymm2
1289	vpunpcklqdq	%ymm15,%ymm10,%ymm1
/*
 * Tail of the 8-way AVX2 state transpose.  Each ymm register holds the
 * same ChaCha20 state word across eight parallel blocks; the
 * vpunpck{l,h}{dq,qdq} + vperm2i128 ladder regroups the lanes so that
 * each register ends up holding 32 contiguous keystream bytes.
 */
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	/* Park two finished keystream registers in the stack scratch area
	   and reload the two working rows that were spilled there earlier
	   in the round loop. */
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	/* ChaCha20 feed-forward for state rows held in ymm12/ymm13/ymm15/
	   ymm9: add back the per-lane initial state words spilled at fixed
	   offsets from %rax.  NOTE(review): the 384..608-512(%rax) slots are
	   presumably filled by the setup code before this chunk -- confirm
	   against the loop head. */
	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	/* 32-bit interleave of the second half of the state (same
	   transpose pattern as above). */
	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	/* Feed-forward for the last four state rows (ymm4..ymm7). */
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	/* 128-bit lane shuffles complete the transpose of the second half. */
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	/* Restore the two keystream registers parked above. */
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	/* %rdx = bytes remaining.  If fewer than a full 512-byte batch
	   (8 blocks x 64 bytes) remain, go handle the tail. */
	cmpq	$512,%rdx
	jb	.Ltail8x

	/*
	 * Fast path: XOR 512 bytes of keystream into the input (%rsi) and
	 * store to the output (%rdi), 128 bytes per group, advancing both
	 * pointers as we go.
	 */
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	/* Consume this batch; re-enter the outer 8-block loop while any
	   bytes remain (the length was a multiple of 512 to get here). */
	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

/*
 * Tail dispatch: fewer than 512 bytes remain in %rdx.  Branch to the
 * handler for the largest 64-byte multiple that still fits; each
 * handler XORs/stores its whole multiple, then stashes the next 64
 * bytes of keystream on the stack and falls into the byte-at-a-time
 * loop for any sub-64-byte remainder.
 */
.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	/* Fewer than 64 bytes: spill the first 64 keystream bytes
	   (ymm6/ymm8) to the stack scratch area and byte-loop over them;
	   %r10 is the running byte index. */
	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

/*
 * Tail handlers.  Each .LNNN_or_more8x label XORs exactly NNN bytes of
 * keystream with the input and stores them; the keystream registers are
 * consumed in transposed block order (ymm6, ymm8, ymm1, ymm5, ymm12,
 * ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9, ymm0,
 * ymm4 -- 32 bytes each).  The `je .Ldone8x` takes the flags from the
 * dispatch `cmpq`, i.e. the length was exactly NNN.  Otherwise the
 * handler advances src/dst, subtracts NNN from %rdx, parks the next two
 * keystream registers at 0(%rsp)/32(%rsp) and falls through / jumps to
 * the byte loop.
 */
.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x		/* length was exactly 64 */

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x		/* length was exactly 128 */

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x		/* length was exactly 192 */

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x		/* length was exactly 256 */

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x		/* length was exactly 320 */

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x		/* length was exactly 384 */

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x		/* length was exactly 448 */

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)
	/* falls through to .Loop_tail8x */

/*
 * Sub-64-byte remainder: XOR input bytes one at a time against the
 * keystream bytes parked at 0(%rsp), using %r10 as the byte index and
 * %rdx as the countdown.  out[i] = in[i] ^ keystream[i].
 */
.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax	/* al = in[r10] */
	movzbl	(%rsp,%r10,1),%ecx	/* cl = keystream[r10] */
	leaq	1(%r10),%r10		/* lea: advance index without clobbering flags */
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)	/* out[r10-1] = al (index already bumped) */
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	/* vzeroall clears every ymm register: wipes keystream/key material
	   from the vector state and leaves the SSE/AVX transition clean
	   for the caller. */
	vzeroall
	/* %r9 holds the caller's stack pointer -- NOTE(review): saved by
	   the prologue outside this chunk; confirm against function entry. */
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
1628#endif
1629