#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
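# For example, a call such as &vadd_i32("q0","q0","q4") falls through to
# AUTOLOAD above and appends "\tvadd.i32\tq0,q0,q4\n" to $code, while a
# numeric last argument, as in &vshr_u32($b,$t,20), is emitted as the
# immediate "#20".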

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));
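# @x maps the sixteen 32-bit words of the ChaCha state onto integer
# registers; the entries marked "x" (words 8-11, 13 and 15) have no register
# of their own and live in the off-load area on the stack (see the layout
# note in ROUND below), while @t are the four scratch registers r8-r11.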

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# The 'a's and 'b's are permanently allocated in registers,
	# @x[0..7], while the 'c's and a pair of 'd's are maintained
	# in memory. If you look at the 'c' column, you'll notice that
	# the pair of 'c's is invariant between rounds. This means we
	# only have to reload them once per round, in the middle. This
	# is why you'll see a bunch of 'c' stores and loads in the
	# middle, but none at the beginning or end. If you look at the
	# 'd' column, you'll notice that 15 and 13 are reused in the
	# next pair of rounds. This is why these two are chosen for
	# offloading to memory, to make the loads count for more.
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}
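
# For reference, a plain Perl model of one ChaCha quarter-round on 32-bit
# words (a sketch, never called by the generator): ROUND above emits a full
# round, i.e. four such quarter-rounds software-pipelined two at a time, and
# folds the rotations into the barrel shifter of the following mov/eor
# instructions instead of rotating eagerly as this model does.
sub QUARTERROUND_model {
my ($a,$b,$c,$d)=@_;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
	($a,$b,$c,$d);
}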

$code.=<<___;
#include <ring-core/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
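
# A note on the NEON idioms above: the veor result is kept in the temporary
# $t, and a 32-bit rotate left by n is then synthesized as vshr.u32 by 32-n
# followed by vsli.32 by n (the rotate by 16 simply uses vrev32.16). The
# trailing vext.8 instructions rotate the 'b', 'c' and 'd' lanes so that
# consecutive calls alternate between column and diagonal quarter-rounds on
# the same registers.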

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
#endif
___
}}}
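
# The final pass below evaluates any `...` expressions in the generated code
# and rewrites the "qN#lo"/"qN#hi" notation used above into the corresponding
# D registers (q<N>#lo -> d<2N>, q<N>#hi -> d<2N+1>) before printing.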

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";