xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/aesv8-armv8-linux.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <openssl/arm_arch.h>
8
9#if __ARM_MAX_ARCH__>=7
10.text
11.arch	armv8-a+crypto
// Constant table for the key-expansion routines below (loaded via
// adrp/add + ld1 into v1/v2):
//   row 0: AES round-constant seed (0x01 per lane; doubled with shl each round)
//   row 1: tbl permutation mask that rotates-and-splats the last key word
//   row 2: 0x1b — the next round constant after shl overflows past 0x80
12.section	.rodata
13.align	5
14.Lrcon:
15.long	0x01,0x01,0x01,0x01
16.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
17.long	0x1b,0x1b,0x1b,0x1b
18
19.text
20
// int aes_hw_set_encrypt_key(const uint8_t *user_key, unsigned bits, AES_KEY *key)
//
// In:   x0 = user_key (raw key bytes, read with ld1)
//       w1 = key size in bits; accepted values are 128, 192 and 256
//            (must lie in [128,256] and be a multiple of 64 — see checks below)
//       x2 = output key schedule
// Out:  x0 = 0 on success, -1 if user_key or key is NULL, -2 for an
//       unsupported bit count.  Round keys are stored from [x2] upward and
//       the round count (10/12/14, from w12) is stored at offset 240.
// Registers: v1 = round constants (.Lrcon), v2 = rotate-and-splat tbl mask,
//       v3/v4 = key words being expanded, v0 = zero, v5/v6 = scratch.
21.globl	aes_hw_set_encrypt_key
22.hidden	aes_hw_set_encrypt_key
23.type	aes_hw_set_encrypt_key,%function
24.align	5
25aes_hw_set_encrypt_key:
26.Lenc_key:
27	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
28	AARCH64_VALID_CALL_TARGET
29	stp	x29,x30,[sp,#-16]!
30	add	x29,sp,#0
31	mov	x3,#-1			// default return: -1 (NULL argument)
32	cmp	x0,#0
33	b.eq	.Lenc_key_abort
34	cmp	x2,#0
35	b.eq	.Lenc_key_abort
36	mov	x3,#-2			// return -2 for a bad bit count
37	cmp	w1,#128
38	b.lt	.Lenc_key_abort
39	cmp	w1,#256
40	b.gt	.Lenc_key_abort
41	tst	w1,#0x3f		// must be a multiple of 64 bits
42	b.ne	.Lenc_key_abort
43
44	adrp	x3,.Lrcon
45	add	x3,x3,:lo12:.Lrcon
46	cmp	w1,#192
47
48	eor	v0.16b,v0.16b,v0.16b	// v0 = 0 (zero key for aese / ext padding)
49	ld1	{v3.16b},[x0],#16	// first 128 bits of the user key
50	mov	w1,#8		// reuse w1
51	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon row 0, v2 = splat mask
52
53	b.lt	.Loop128
54	b.eq	.L192
55	b	.L256
56
// 128-bit key: 8 loop iterations plus two unrolled expansions below
// produce the 10 round keys after the initial one; each iteration stores
// the current key, rotates/substitutes the last word (tbl + aese with a
// zero key = pure SubBytes), and xors in the running round constant.
57.align	4
58.Loop128:
59	tbl	v6.16b,{v3.16b},v2.16b
60	ext	v5.16b,v0.16b,v3.16b,#12
61	st1	{v3.4s},[x2],#16
62	aese	v6.16b,v0.16b
63	subs	w1,w1,#1
64
65	eor	v3.16b,v3.16b,v5.16b
66	ext	v5.16b,v0.16b,v5.16b,#12
67	eor	v3.16b,v3.16b,v5.16b
68	ext	v5.16b,v0.16b,v5.16b,#12
69	eor	v6.16b,v6.16b,v1.16b
70	eor	v3.16b,v3.16b,v5.16b
71	shl	v1.16b,v1.16b,#1	// next round constant (doubling in GF(2^8))
72	eor	v3.16b,v3.16b,v6.16b
73	b.ne	.Loop128
74
75	ld1	{v1.4s},[x3]		// switch to rcon row 2 (0x1b) past 0x80
76
77	tbl	v6.16b,{v3.16b},v2.16b
78	ext	v5.16b,v0.16b,v3.16b,#12
79	st1	{v3.4s},[x2],#16
80	aese	v6.16b,v0.16b
81
82	eor	v3.16b,v3.16b,v5.16b
83	ext	v5.16b,v0.16b,v5.16b,#12
84	eor	v3.16b,v3.16b,v5.16b
85	ext	v5.16b,v0.16b,v5.16b,#12
86	eor	v6.16b,v6.16b,v1.16b
87	eor	v3.16b,v3.16b,v5.16b
88	shl	v1.16b,v1.16b,#1
89	eor	v3.16b,v3.16b,v6.16b
90
91	tbl	v6.16b,{v3.16b},v2.16b
92	ext	v5.16b,v0.16b,v3.16b,#12
93	st1	{v3.4s},[x2],#16
94	aese	v6.16b,v0.16b
95
96	eor	v3.16b,v3.16b,v5.16b
97	ext	v5.16b,v0.16b,v5.16b,#12
98	eor	v3.16b,v3.16b,v5.16b
99	ext	v5.16b,v0.16b,v5.16b,#12
100	eor	v6.16b,v6.16b,v1.16b
101	eor	v3.16b,v3.16b,v5.16b
102	eor	v3.16b,v3.16b,v6.16b
103	st1	{v3.4s},[x2]
104	add	x2,x2,#0x50		// x2 now points at offset 240 from the start
105
106	mov	w12,#10			// 10 rounds for AES-128
107	b	.Ldone
108
// 192-bit key: the extra 64 key bits go through v4; the splat mask is
// shifted down by 8 bytes so tbl picks the last word of the half register.
109.align	4
110.L192:
111	ld1	{v4.8b},[x0],#8
112	movi	v6.16b,#8			// borrow v6.16b
113	st1	{v3.4s},[x2],#16
114	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
115
116.Loop192:
117	tbl	v6.16b,{v4.16b},v2.16b
118	ext	v5.16b,v0.16b,v3.16b,#12
119	st1	{v4.8b},[x2],#8
120	aese	v6.16b,v0.16b
121	subs	w1,w1,#1
122
123	eor	v3.16b,v3.16b,v5.16b
124	ext	v5.16b,v0.16b,v5.16b,#12
125	eor	v3.16b,v3.16b,v5.16b
126	ext	v5.16b,v0.16b,v5.16b,#12
127	eor	v3.16b,v3.16b,v5.16b
128
129	dup	v5.4s,v3.s[3]
130	eor	v5.16b,v5.16b,v4.16b
131	eor	v6.16b,v6.16b,v1.16b
132	ext	v4.16b,v0.16b,v4.16b,#12
133	shl	v1.16b,v1.16b,#1
134	eor	v4.16b,v4.16b,v5.16b
135	eor	v3.16b,v3.16b,v6.16b
136	eor	v4.16b,v4.16b,v6.16b
137	st1	{v3.4s},[x2],#16
138	b.ne	.Loop192
139
140	mov	w12,#12			// 12 rounds for AES-192
141	add	x2,x2,#0x20
142	b	.Ldone
143
// 256-bit key: two 128-bit halves (v3, v4) expanded alternately; the
// second half uses a plain splat (dup) instead of the rotating tbl.
144.align	4
145.L256:
146	ld1	{v4.16b},[x0]
147	mov	w1,#7
148	mov	w12,#14			// 14 rounds for AES-256
149	st1	{v3.4s},[x2],#16
150
151.Loop256:
152	tbl	v6.16b,{v4.16b},v2.16b
153	ext	v5.16b,v0.16b,v3.16b,#12
154	st1	{v4.4s},[x2],#16
155	aese	v6.16b,v0.16b
156	subs	w1,w1,#1
157
158	eor	v3.16b,v3.16b,v5.16b
159	ext	v5.16b,v0.16b,v5.16b,#12
160	eor	v3.16b,v3.16b,v5.16b
161	ext	v5.16b,v0.16b,v5.16b,#12
162	eor	v6.16b,v6.16b,v1.16b
163	eor	v3.16b,v3.16b,v5.16b
164	shl	v1.16b,v1.16b,#1
165	eor	v3.16b,v3.16b,v6.16b
166	st1	{v3.4s},[x2],#16
167	b.eq	.Ldone
168
169	dup	v6.4s,v3.s[3]		// just splat
170	ext	v5.16b,v0.16b,v4.16b,#12
171	aese	v6.16b,v0.16b
172
173	eor	v4.16b,v4.16b,v5.16b
174	ext	v5.16b,v0.16b,v5.16b,#12
175	eor	v4.16b,v4.16b,v5.16b
176	ext	v5.16b,v0.16b,v5.16b,#12
177	eor	v4.16b,v4.16b,v5.16b
178
179	eor	v4.16b,v4.16b,v6.16b
180	b	.Loop256
181
.Ldone-equivalent comment: store the round count and signal success.
182.Ldone:
183	str	w12,[x2]		// rounds at offset 240 of the AES_KEY
184	mov	x3,#0			// success
185
186.Lenc_key_abort:
187	mov	x0,x3			// return value
188	ldr	x29,[sp],#16
189	ret
190.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
191
// int aes_hw_set_decrypt_key(const uint8_t *user_key, unsigned bits, AES_KEY *key)
//
// Same arguments and return values as aes_hw_set_encrypt_key (it tail-calls
// .Lenc_key first).  It then converts the encryption schedule in place into
// the "equivalent inverse cipher" form: the round-key order is reversed and
// aesimc (InvMixColumns) is applied to every round key except the first and
// last, which are only swapped.
192.globl	aes_hw_set_decrypt_key
193.hidden	aes_hw_set_decrypt_key
194.type	aes_hw_set_decrypt_key,%function
195.align	5
196aes_hw_set_decrypt_key:
197	AARCH64_SIGN_LINK_REGISTER
198	stp	x29,x30,[sp,#-16]!
199	add	x29,sp,#0
200	bl	.Lenc_key
201
202	cmp	x0,#0
203	b.ne	.Ldec_key_abort		// propagate the error code unchanged
204
205	sub	x2,x2,#240		// restore original x2
206	mov	x4,#-16			// backward stride for the high pointer
207	add	x0,x2,x12,lsl#4	// end of key schedule (x12 = rounds from .Lenc_key)
208
// Swap the first and last round keys without aesimc — the outer keys of
// the inverse cipher are used as plain AddRoundKey values.
209	ld1	{v0.4s},[x2]
210	ld1	{v1.4s},[x0]
211	st1	{v0.4s},[x0],x4
212	st1	{v1.4s},[x2],#16
213
// Walk inward from both ends, applying InvMixColumns and swapping.
214.Loop_imc:
215	ld1	{v0.4s},[x2]
216	ld1	{v1.4s},[x0]
217	aesimc	v0.16b,v0.16b
218	aesimc	v1.16b,v1.16b
219	st1	{v0.4s},[x0],x4
220	st1	{v1.4s},[x2],#16
221	cmp	x0,x2
222	b.hi	.Loop_imc
223
// Middle round key: transform in place (pointers have met).
224	ld1	{v0.4s},[x2]
225	aesimc	v0.16b,v0.16b
226	st1	{v0.4s},[x0]
227
228	eor	x0,x0,x0		// return value
229.Ldec_key_abort:
230	ldp	x29,x30,[sp],#16
231	AARCH64_VALIDATE_LINK_REGISTER
232	ret
233.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
// void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key)
//
// Encrypts one 16-byte block.  x0 = in, x1 = out, x2 = key schedule with
// the round count at [x2,#240].  The loop retires two rounds per iteration
// (aese+aesmc with v0 then v1) while prefetching the next two round keys;
// the final round is aese only (no MixColumns) followed by the last-key xor.
234.globl	aes_hw_encrypt
235.hidden	aes_hw_encrypt
236.type	aes_hw_encrypt,%function
237.align	5
238aes_hw_encrypt:
239	AARCH64_VALID_CALL_TARGET
240	ldr	w3,[x2,#240]		// w3 = number of rounds
241	ld1	{v0.4s},[x2],#16	// round key 0
242	ld1	{v2.16b},[x0]		// v2 = plaintext block
243	sub	w3,w3,#2		// loop counts rounds in pairs
244	ld1	{v1.4s},[x2],#16	// round key 1
245
246.Loop_enc:
247	aese	v2.16b,v0.16b
248	aesmc	v2.16b,v2.16b
249	ld1	{v0.4s},[x2],#16
250	subs	w3,w3,#2
251	aese	v2.16b,v1.16b
252	aesmc	v2.16b,v2.16b
253	ld1	{v1.4s},[x2],#16
254	b.gt	.Loop_enc
255
256	aese	v2.16b,v0.16b
257	aesmc	v2.16b,v2.16b
258	ld1	{v0.4s},[x2]		// last round key
259	aese	v2.16b,v1.16b		// final round: no MixColumns
260	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey
261
262	st1	{v2.16b},[x1]
263	ret
264.size	aes_hw_encrypt,.-aes_hw_encrypt
// void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key)
//
// Decrypts one 16-byte block; exact mirror of aes_hw_encrypt using
// aesd/aesimc, so `key` must be a decryption schedule produced by
// aes_hw_set_decrypt_key (inverse-cipher form).  x0 = in, x1 = out,
// x2 = key schedule with the round count at [x2,#240].
265.globl	aes_hw_decrypt
266.hidden	aes_hw_decrypt
267.type	aes_hw_decrypt,%function
268.align	5
269aes_hw_decrypt:
270	AARCH64_VALID_CALL_TARGET
271	ldr	w3,[x2,#240]		// w3 = number of rounds
272	ld1	{v0.4s},[x2],#16	// round key 0
273	ld1	{v2.16b},[x0]		// v2 = ciphertext block
274	sub	w3,w3,#2		// loop counts rounds in pairs
275	ld1	{v1.4s},[x2],#16	// round key 1
276
277.Loop_dec:
278	aesd	v2.16b,v0.16b
279	aesimc	v2.16b,v2.16b
280	ld1	{v0.4s},[x2],#16
281	subs	w3,w3,#2
282	aesd	v2.16b,v1.16b
283	aesimc	v2.16b,v2.16b
284	ld1	{v1.4s},[x2],#16
285	b.gt	.Loop_dec
286
287	aesd	v2.16b,v0.16b
288	aesimc	v2.16b,v2.16b
289	ld1	{v0.4s},[x2]		// last round key
290	aesd	v2.16b,v1.16b		// final round: no InvMixColumns
291	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey
292
293	st1	{v2.16b},[x1]
294	ret
295.size	aes_hw_decrypt,.-aes_hw_decrypt
// void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
//                         const AES_KEY *key, uint8_t *ivec, int enc)
//
// In:   x0 = in, x1 = out, x2 = length in bytes (truncated to a multiple
//       of 16 below; < 16 returns immediately), x3 = key schedule,
//       x4 = 16-byte IV (updated in place on exit), w5 = enc flag
//       (zero => decrypt, nonzero => encrypt).
// The last 7 round keys are kept resident in v18-v23/v7; encryption is
// serial (each block chains into the next), while decryption is
// interleaved three blocks at a time with a 1-2 block tail path.
// v6 always holds the running IV / last ciphertext block.
296.globl	aes_hw_cbc_encrypt
297.hidden	aes_hw_cbc_encrypt
298.type	aes_hw_cbc_encrypt,%function
299.align	5
300aes_hw_cbc_encrypt:
301	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
302	AARCH64_VALID_CALL_TARGET
303	stp	x29,x30,[sp,#-16]!
304	add	x29,sp,#0
305	subs	x2,x2,#16
306	mov	x8,#16			// x8 = input stride; zeroed for the last block
307	b.lo	.Lcbc_abort
308	csel	x8,xzr,x8,eq
309
310	cmp	w5,#0			// en- or decrypting?
311	ldr	w5,[x3,#240]		// w5 = rounds (reuses the flag register)
312	and	x2,x2,#-16		// whole blocks only
313	ld1	{v6.16b},[x4]		// v6 = IV
314	ld1	{v0.16b},[x0],x8	// first input block
315
316	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
317	sub	w5,w5,#6
318	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
319	sub	w5,w5,#2
320	ld1	{v18.4s,v19.4s},[x7],#32
321	ld1	{v20.4s,v21.4s},[x7],#32
322	ld1	{v22.4s,v23.4s},[x7],#32
323	ld1	{v7.4s},[x7]		// v7 = final round key
324
325	add	x7,x3,#32
326	mov	w6,w5
327	b.eq	.Lcbc_dec
328
// ---- CBC encrypt (serial; block n+1 depends on ciphertext n) ----
329	cmp	w5,#2
330	eor	v0.16b,v0.16b,v6.16b	// xor IV into the first block
331	eor	v5.16b,v16.16b,v7.16b	// v5 = rndkey[0] ^ last key (folded pre-xor)
332	b.eq	.Lcbc_enc128
333
// 192/256-bit path: stash pointers to mid-schedule keys so the loop can
// reload v16/v17 without recomputing addresses.
334	ld1	{v2.4s,v3.4s},[x7]
335	add	x7,x3,#16
336	add	x6,x3,#16*4
337	add	x12,x3,#16*5
338	aese	v0.16b,v16.16b
339	aesmc	v0.16b,v0.16b
340	add	x14,x3,#16*6
341	add	x3,x3,#16*7
342	b	.Lenter_cbc_enc
343
344.align	4
345.Loop_cbc_enc:
346	aese	v0.16b,v16.16b
347	aesmc	v0.16b,v0.16b
348	st1	{v6.16b},[x1],#16	// store previous ciphertext block
349.Lenter_cbc_enc:
350	aese	v0.16b,v17.16b
351	aesmc	v0.16b,v0.16b
352	aese	v0.16b,v2.16b
353	aesmc	v0.16b,v0.16b
354	ld1	{v16.4s},[x6]
355	cmp	w5,#4
356	aese	v0.16b,v3.16b
357	aesmc	v0.16b,v0.16b
358	ld1	{v17.4s},[x12]
359	b.eq	.Lcbc_enc192
360
// Two extra rounds, only taken for 256-bit keys.
361	aese	v0.16b,v16.16b
362	aesmc	v0.16b,v0.16b
363	ld1	{v16.4s},[x14]
364	aese	v0.16b,v17.16b
365	aesmc	v0.16b,v0.16b
366	ld1	{v17.4s},[x3]
367	nop
368
369.Lcbc_enc192:
370	aese	v0.16b,v16.16b
371	aesmc	v0.16b,v0.16b
372	subs	x2,x2,#16
373	aese	v0.16b,v17.16b
374	aesmc	v0.16b,v0.16b
375	csel	x8,xzr,x8,eq		// don't read past the last block
376	aese	v0.16b,v18.16b
377	aesmc	v0.16b,v0.16b
378	aese	v0.16b,v19.16b
379	aesmc	v0.16b,v0.16b
380	ld1	{v16.16b},[x0],x8	// next plaintext block
381	aese	v0.16b,v20.16b
382	aesmc	v0.16b,v0.16b
383	eor	v16.16b,v16.16b,v5.16b	// pre-xor with rndkey[0]^lastkey
384	aese	v0.16b,v21.16b
385	aesmc	v0.16b,v0.16b
386	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
387	aese	v0.16b,v22.16b
388	aesmc	v0.16b,v0.16b
389	aese	v0.16b,v23.16b
390	eor	v6.16b,v0.16b,v7.16b	// v6 = finished ciphertext block
391	b.hs	.Loop_cbc_enc
392
393	st1	{v6.16b},[x1],#16
394	b	.Lcbc_done
395
// ---- CBC encrypt, 128-bit key (all round keys fit in registers) ----
396.align	5
397.Lcbc_enc128:
398	ld1	{v2.4s,v3.4s},[x7]
399	aese	v0.16b,v16.16b
400	aesmc	v0.16b,v0.16b
401	b	.Lenter_cbc_enc128
402.Loop_cbc_enc128:
403	aese	v0.16b,v16.16b
404	aesmc	v0.16b,v0.16b
405	st1	{v6.16b},[x1],#16	// store previous ciphertext block
406.Lenter_cbc_enc128:
407	aese	v0.16b,v17.16b
408	aesmc	v0.16b,v0.16b
409	subs	x2,x2,#16
410	aese	v0.16b,v2.16b
411	aesmc	v0.16b,v0.16b
412	csel	x8,xzr,x8,eq		// don't read past the last block
413	aese	v0.16b,v3.16b
414	aesmc	v0.16b,v0.16b
415	aese	v0.16b,v18.16b
416	aesmc	v0.16b,v0.16b
417	aese	v0.16b,v19.16b
418	aesmc	v0.16b,v0.16b
419	ld1	{v16.16b},[x0],x8
420	aese	v0.16b,v20.16b
421	aesmc	v0.16b,v0.16b
422	aese	v0.16b,v21.16b
423	aesmc	v0.16b,v0.16b
424	aese	v0.16b,v22.16b
425	aesmc	v0.16b,v0.16b
426	eor	v16.16b,v16.16b,v5.16b	// pre-xor with rndkey[0]^lastkey
427	aese	v0.16b,v23.16b
428	eor	v6.16b,v0.16b,v7.16b	// v6 = finished ciphertext block
429	b.hs	.Loop_cbc_enc128
430
431	st1	{v6.16b},[x1],#16
432	b	.Lcbc_done
// ---- CBC decrypt (independent blocks; processed three at a time) ----
433.align	5
434.Lcbc_dec:
435	ld1	{v18.16b},[x0],#16
436	subs	x2,x2,#32		// bias
437	add	w6,w5,#2
438	orr	v3.16b,v0.16b,v0.16b	// keep ciphertext copies for chaining
439	orr	v1.16b,v0.16b,v0.16b
440	orr	v19.16b,v18.16b,v18.16b
441	b.lo	.Lcbc_dec_tail		// fewer than 3 blocks remain
442
443	orr	v1.16b,v18.16b,v18.16b
444	ld1	{v18.16b},[x0],#16
445	orr	v2.16b,v0.16b,v0.16b
446	orr	v3.16b,v1.16b,v1.16b
447	orr	v19.16b,v18.16b,v18.16b
448
// Main 3x-interleaved decrypt rounds; w6 counts round-key pairs.
449.Loop3x_cbc_dec:
450	aesd	v0.16b,v16.16b
451	aesimc	v0.16b,v0.16b
452	aesd	v1.16b,v16.16b
453	aesimc	v1.16b,v1.16b
454	aesd	v18.16b,v16.16b
455	aesimc	v18.16b,v18.16b
456	ld1	{v16.4s},[x7],#16
457	subs	w6,w6,#2
458	aesd	v0.16b,v17.16b
459	aesimc	v0.16b,v0.16b
460	aesd	v1.16b,v17.16b
461	aesimc	v1.16b,v1.16b
462	aesd	v18.16b,v17.16b
463	aesimc	v18.16b,v18.16b
464	ld1	{v17.4s},[x7],#16
465	b.gt	.Loop3x_cbc_dec
466
// Last rounds, folded with the CBC xor: v4/v5/v17 hold (prev-ct ^ lastkey).
467	aesd	v0.16b,v16.16b
468	aesimc	v0.16b,v0.16b
469	aesd	v1.16b,v16.16b
470	aesimc	v1.16b,v1.16b
471	aesd	v18.16b,v16.16b
472	aesimc	v18.16b,v18.16b
473	eor	v4.16b,v6.16b,v7.16b
474	subs	x2,x2,#0x30
475	eor	v5.16b,v2.16b,v7.16b
476	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
477	aesd	v0.16b,v17.16b
478	aesimc	v0.16b,v0.16b
479	aesd	v1.16b,v17.16b
480	aesimc	v1.16b,v1.16b
481	aesd	v18.16b,v17.16b
482	aesimc	v18.16b,v18.16b
483	eor	v17.16b,v3.16b,v7.16b
484	add	x0,x0,x6		// x0 is adjusted in such way that
485					// at exit from the loop v1.16b-v18.16b
486					// are loaded with last "words"
487	orr	v6.16b,v19.16b,v19.16b	// v6 = newest ciphertext (next IV)
488	mov	x7,x3
489	aesd	v0.16b,v20.16b
490	aesimc	v0.16b,v0.16b
491	aesd	v1.16b,v20.16b
492	aesimc	v1.16b,v1.16b
493	aesd	v18.16b,v20.16b
494	aesimc	v18.16b,v18.16b
495	ld1	{v2.16b},[x0],#16
496	aesd	v0.16b,v21.16b
497	aesimc	v0.16b,v0.16b
498	aesd	v1.16b,v21.16b
499	aesimc	v1.16b,v1.16b
500	aesd	v18.16b,v21.16b
501	aesimc	v18.16b,v18.16b
502	ld1	{v3.16b},[x0],#16
503	aesd	v0.16b,v22.16b
504	aesimc	v0.16b,v0.16b
505	aesd	v1.16b,v22.16b
506	aesimc	v1.16b,v1.16b
507	aesd	v18.16b,v22.16b
508	aesimc	v18.16b,v18.16b
509	ld1	{v19.16b},[x0],#16
510	aesd	v0.16b,v23.16b
511	aesd	v1.16b,v23.16b
512	aesd	v18.16b,v23.16b
513	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
514	add	w6,w5,#2
515	eor	v4.16b,v4.16b,v0.16b	// plaintext = aesd result ^ prev ct ^ lastkey
516	eor	v5.16b,v5.16b,v1.16b
517	eor	v18.16b,v18.16b,v17.16b
518	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
519	st1	{v4.16b},[x1],#16
520	orr	v0.16b,v2.16b,v2.16b
521	st1	{v5.16b},[x1],#16
522	orr	v1.16b,v3.16b,v3.16b
523	st1	{v18.16b},[x1],#16
524	orr	v18.16b,v19.16b,v19.16b
525	b.hs	.Loop3x_cbc_dec
526
527	cmn	x2,#0x30		// exactly zero blocks left?
528	b.eq	.Lcbc_done
529	nop
530
// Tail: one or two remaining blocks, decrypted in v1/v18.
531.Lcbc_dec_tail:
532	aesd	v1.16b,v16.16b
533	aesimc	v1.16b,v1.16b
534	aesd	v18.16b,v16.16b
535	aesimc	v18.16b,v18.16b
536	ld1	{v16.4s},[x7],#16
537	subs	w6,w6,#2
538	aesd	v1.16b,v17.16b
539	aesimc	v1.16b,v1.16b
540	aesd	v18.16b,v17.16b
541	aesimc	v18.16b,v18.16b
542	ld1	{v17.4s},[x7],#16
543	b.gt	.Lcbc_dec_tail
544
545	aesd	v1.16b,v16.16b
546	aesimc	v1.16b,v1.16b
547	aesd	v18.16b,v16.16b
548	aesimc	v18.16b,v18.16b
549	aesd	v1.16b,v17.16b
550	aesimc	v1.16b,v1.16b
551	aesd	v18.16b,v17.16b
552	aesimc	v18.16b,v18.16b
553	aesd	v1.16b,v20.16b
554	aesimc	v1.16b,v1.16b
555	aesd	v18.16b,v20.16b
556	aesimc	v18.16b,v18.16b
557	cmn	x2,#0x20		// one block (eq) or two?
558	aesd	v1.16b,v21.16b
559	aesimc	v1.16b,v1.16b
560	aesd	v18.16b,v21.16b
561	aesimc	v18.16b,v18.16b
562	eor	v5.16b,v6.16b,v7.16b
563	aesd	v1.16b,v22.16b
564	aesimc	v1.16b,v1.16b
565	aesd	v18.16b,v22.16b
566	aesimc	v18.16b,v18.16b
567	eor	v17.16b,v3.16b,v7.16b
568	aesd	v1.16b,v23.16b
569	aesd	v18.16b,v23.16b
570	b.eq	.Lcbc_dec_one
571	eor	v5.16b,v5.16b,v1.16b
572	eor	v17.16b,v17.16b,v18.16b
573	orr	v6.16b,v19.16b,v19.16b	// last ciphertext becomes the new IV
574	st1	{v5.16b},[x1],#16
575	st1	{v17.16b},[x1],#16
576	b	.Lcbc_done
577
578.Lcbc_dec_one:
579	eor	v5.16b,v5.16b,v18.16b
580	orr	v6.16b,v19.16b,v19.16b	// last ciphertext becomes the new IV
581	st1	{v5.16b},[x1],#16
582
583.Lcbc_done:
584	st1	{v6.16b},[x4]		// write back the updated IV
585.Lcbc_abort:
586	ldr	x29,[sp],#16
587	ret
588.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
// void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
//                                  size_t blocks, const AES_KEY *key,
//                                  const uint8_t ivec[16])
//
// In:   x0 = in, x1 = out, x2 = number of 16-byte blocks, x3 = key
//       schedule (rounds at [x3,#240]), x4 = 16-byte counter block whose
//       last 32-bit word is a big-endian counter (loaded at [x4,#12]).
// Keystream blocks are generated three at a time (v0/v1/v18) in the main
// loop, with a 1-2 block tail; each keystream block is xored with the
// input.  The counter word w8 is incremented per block, byte-reversed on
// little-endian, and inserted into lane 3 of the counter vector.
589.globl	aes_hw_ctr32_encrypt_blocks
590.hidden	aes_hw_ctr32_encrypt_blocks
591.type	aes_hw_ctr32_encrypt_blocks,%function
592.align	5
593aes_hw_ctr32_encrypt_blocks:
594	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
595	AARCH64_VALID_CALL_TARGET
596	stp	x29,x30,[sp,#-16]!
597	add	x29,sp,#0
598	ldr	w5,[x3,#240]		// w5 = rounds
599
600	ldr	w8, [x4, #12]		// w8 = 32-bit counter (big-endian in memory)
601	ld1	{v0.4s},[x4]		// v0 = initial counter block
602
603	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
604	sub	w5,w5,#4
605	mov	x12,#16			// input stride for the tail; 0 if one block
606	cmp	x2,#2
607	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
608	sub	w5,w5,#2
609	ld1	{v20.4s,v21.4s},[x7],#32
610	ld1	{v22.4s,v23.4s},[x7],#32
611	ld1	{v7.4s},[x7]		// v7 = final round key
612	add	x7,x3,#32
613	mov	w6,w5
614	csel	x12,xzr,x12,lo
615
616	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
617	// affected by silicon errata #1742098 [0] and #1655431 [1],
618	// respectively, where the second instruction of an aese/aesmc
619	// instruction pair may execute twice if an interrupt is taken right
620	// after the first instruction consumes an input register of which a
621	// single 32-bit lane has been updated the last time it was modified.
622	//
623	// This function uses a counter in one 32-bit lane. The vmov lines
624	// could write to v1.16b and v18.16b directly, but that trips this bugs.
625	// We write to v6.16b and copy to the final register as a workaround.
626	//
627	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
628	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
629#ifndef __AARCH64EB__
630	rev	w8, w8			// work on the counter in host order
631#endif
632	add	w10, w8, #1
633	orr	v6.16b,v0.16b,v0.16b
634	rev	w10, w10
635	mov	v6.s[3],w10		// counter block for the 2nd keystream block
636	add	w8, w8, #2
637	orr	v1.16b,v6.16b,v6.16b
638	b.ls	.Lctr32_tail		// 2 or fewer blocks: skip the 3x loop
639	rev	w12, w8
640	mov	v6.s[3],w12		// counter block for the 3rd keystream block
641	sub	x2,x2,#3		// bias
642	orr	v18.16b,v6.16b,v6.16b
643	b	.Loop3x_ctr32
644
// Main 3x-interleaved rounds; w6 counts round-key pairs.
645.align	4
646.Loop3x_ctr32:
647	aese	v0.16b,v16.16b
648	aesmc	v0.16b,v0.16b
649	aese	v1.16b,v16.16b
650	aesmc	v1.16b,v1.16b
651	aese	v18.16b,v16.16b
652	aesmc	v18.16b,v18.16b
653	ld1	{v16.4s},[x7],#16
654	subs	w6,w6,#2
655	aese	v0.16b,v17.16b
656	aesmc	v0.16b,v0.16b
657	aese	v1.16b,v17.16b
658	aesmc	v1.16b,v1.16b
659	aese	v18.16b,v17.16b
660	aesmc	v18.16b,v18.16b
661	ld1	{v17.4s},[x7],#16
662	b.gt	.Loop3x_ctr32
663
// Final five rounds, interleaved with input loads, counter updates for
// the next iteration (into v0/v1/v18 via v6), and output xors.
664	aese	v0.16b,v16.16b
665	aesmc	v4.16b,v0.16b
666	aese	v1.16b,v16.16b
667	aesmc	v5.16b,v1.16b
668	ld1	{v2.16b},[x0],#16
669	add	w9,w8,#1
670	aese	v18.16b,v16.16b
671	aesmc	v18.16b,v18.16b
672	ld1	{v3.16b},[x0],#16
673	rev	w9,w9
674	aese	v4.16b,v17.16b
675	aesmc	v4.16b,v4.16b
676	aese	v5.16b,v17.16b
677	aesmc	v5.16b,v5.16b
678	ld1	{v19.16b},[x0],#16
679	mov	x7,x3
680	aese	v18.16b,v17.16b
681	aesmc	v17.16b,v18.16b
682	aese	v4.16b,v20.16b
683	aesmc	v4.16b,v4.16b
684	aese	v5.16b,v20.16b
685	aesmc	v5.16b,v5.16b
686	eor	v2.16b,v2.16b,v7.16b	// fold final round key into the input
687	add	w10,w8,#2
688	aese	v17.16b,v20.16b
689	aesmc	v17.16b,v17.16b
690	eor	v3.16b,v3.16b,v7.16b
691	add	w8,w8,#3
692	aese	v4.16b,v21.16b
693	aesmc	v4.16b,v4.16b
694	aese	v5.16b,v21.16b
695	aesmc	v5.16b,v5.16b
696	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
697	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
698	 // 32-bit mode. See the comment above.
699	eor	v19.16b,v19.16b,v7.16b
700	mov	v6.s[3], w9
701	aese	v17.16b,v21.16b
702	aesmc	v17.16b,v17.16b
703	orr	v0.16b,v6.16b,v6.16b
704	rev	w10,w10
705	aese	v4.16b,v22.16b
706	aesmc	v4.16b,v4.16b
707	mov	v6.s[3], w10
708	rev	w12,w8
709	aese	v5.16b,v22.16b
710	aesmc	v5.16b,v5.16b
711	orr	v1.16b,v6.16b,v6.16b
712	mov	v6.s[3], w12
713	aese	v17.16b,v22.16b
714	aesmc	v17.16b,v17.16b
715	orr	v18.16b,v6.16b,v6.16b
716	subs	x2,x2,#3
717	aese	v4.16b,v23.16b
718	aese	v5.16b,v23.16b
719	aese	v17.16b,v23.16b
720
721	eor	v2.16b,v2.16b,v4.16b	// out = in ^ keystream
722	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
723	st1	{v2.16b},[x1],#16
724	eor	v3.16b,v3.16b,v5.16b
725	mov	w6,w5
726	st1	{v3.16b},[x1],#16
727	eor	v19.16b,v19.16b,v17.16b
728	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
729	st1	{v19.16b},[x1],#16
730	b.hs	.Loop3x_ctr32
731
732	adds	x2,x2,#3		// undo the bias; 0 means all done
733	b.eq	.Lctr32_done
734	cmp	x2,#1
735	mov	x12,#16
736	csel	x12,xzr,x12,eq		// don't read a second block if only one left
737
// Tail: one or two blocks, keystream in v0/v1.
738.Lctr32_tail:
739	aese	v0.16b,v16.16b
740	aesmc	v0.16b,v0.16b
741	aese	v1.16b,v16.16b
742	aesmc	v1.16b,v1.16b
743	ld1	{v16.4s},[x7],#16
744	subs	w6,w6,#2
745	aese	v0.16b,v17.16b
746	aesmc	v0.16b,v0.16b
747	aese	v1.16b,v17.16b
748	aesmc	v1.16b,v1.16b
749	ld1	{v17.4s},[x7],#16
750	b.gt	.Lctr32_tail
751
752	aese	v0.16b,v16.16b
753	aesmc	v0.16b,v0.16b
754	aese	v1.16b,v16.16b
755	aesmc	v1.16b,v1.16b
756	aese	v0.16b,v17.16b
757	aesmc	v0.16b,v0.16b
758	aese	v1.16b,v17.16b
759	aesmc	v1.16b,v1.16b
760	ld1	{v2.16b},[x0],x12
761	aese	v0.16b,v20.16b
762	aesmc	v0.16b,v0.16b
763	aese	v1.16b,v20.16b
764	aesmc	v1.16b,v1.16b
765	ld1	{v3.16b},[x0]
766	aese	v0.16b,v21.16b
767	aesmc	v0.16b,v0.16b
768	aese	v1.16b,v21.16b
769	aesmc	v1.16b,v1.16b
770	eor	v2.16b,v2.16b,v7.16b	// fold final round key into the input
771	aese	v0.16b,v22.16b
772	aesmc	v0.16b,v0.16b
773	aese	v1.16b,v22.16b
774	aesmc	v1.16b,v1.16b
775	eor	v3.16b,v3.16b,v7.16b
776	aese	v0.16b,v23.16b
777	aese	v1.16b,v23.16b
778
779	cmp	x2,#1
780	eor	v2.16b,v2.16b,v0.16b
781	eor	v3.16b,v3.16b,v1.16b
782	st1	{v2.16b},[x1],#16
783	b.eq	.Lctr32_done		// only one block was requested
784	st1	{v3.16b},[x1]
785
786.Lctr32_done:
787	ldr	x29,[sp],#16
788	ret
789.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
790#endif
791#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
792