// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>

.section	.rodata

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	//	mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	//	mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	//	sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	//	inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	//	input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	//	sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	//	sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	//	sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	//	decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	//	decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	//	decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	//	decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	//	decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	//	rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	//	output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6

.text
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
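##  AArch64 note: the %r10/%xmm9-%xmm15 description above is inherited from
##  the x86-64 original.  In this port, _vpaes_encrypt_preheat (below) loads
##  the constant base into x10 and fills v17-v27 instead:
##    v17     = 0x0F..0F byte mask
##    v18-v19 = .Lk_inv
##    v20-v21 = .Lk_ipt,  v22-v23 = .Lk_sbo
##    v24-v25 = .Lk_sb1,  v26-v27 = .Lk_sb2
##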
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
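##  AArch64 note: here the block arrives in v7 and the result is returned in
##  v0.  x2 points at the scheduled AES_KEY (rounds at offset 240), x9 walks
##  the round keys (current round key in v16), x10/x11 step through the
##  .Lk_mc_forward/.Lk_mc_backward (and finally .Lk_sr) rows, and w8 counts
##  rounds down.  v1-v5 are scratch; the preheated constants sit in v17-v27.
##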
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, :lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
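##
##  vpaes_encrypt
##
##  AAPCS64 arguments: x0 = 16-byte input block, x1 = 16-byte output block,
##  x2 = AES_KEY prepared by vpaes_set_encrypt_key (rounds at offset 240).
##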

.globl	vpaes_encrypt
.hidden	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

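// Two-block variant of _vpaes_encrypt_core: v14-v15 input, v0-v1 output,
// same key and constant registers; used by the CTR32 path below.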
.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, :lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b,  v17.16b
	ushr	v8.16b,  v15.16b,  #4
	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b,  {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b,  v9.16b,   v16.16b
	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,   v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v24.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b,  {v8.16b}, v4.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b,  v8.16b,  v11.16b
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b,  v8.16b,  v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b, v17.16b
	ushr	v8.16b,  v8.16b, #4
	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,  v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v23.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v17.16b, #0x0f
	adrp	x11, .Lk_dipt
	add	x11, x11, :lo12:.Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

##
##  Decryption core
##
##  Same API as encryption core.
##
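##  AArch64 note: as with the encryption core, the input block is taken from
##  v7 and the result returned in v0; x2 points at the scheduled key and the
##  decryption tables sit in v17-v31 from _vpaes_decrypt_preheat.  x11 is
##  pointed at the .Lk_sr row (selected from the round count) used for the
##  final output permutation.
##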
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.hidden	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b, v17.16b
	ushr	v8.16b,  v15.16b, #4
	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	tbl	v8.16b,  {v21.16b},v8.16b
	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,  v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	tbl	v9.16b,  {v25.16b}, v11.16b
	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
	eor	v8.16b,  v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	tbl	v9.16b,  {v27.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	tbl	v9.16b,  {v29.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	tbl	v9.16b,  {v31.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b,  v17.16b
	ushr	v8.16b,  v8.16b,  #4
	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,	 v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	tbl	v9.16b,  {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	eor	v8.16b,  v9.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
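##
##  _vpaes_schedule_core (AArch64 entry conditions)
##
##  Called from vpaes_set_encrypt_key / vpaes_set_decrypt_key below with
##    x0 = user-supplied key, w1 = key size in bits, x2 = output key schedule,
##    w3 = 0 for encryption or 1 for decryption, x8 = initial .Lk_sr offset.
##  _vpaes_key_preheat leaves .Lk_rcon in v8, .Lk_mc_forward[0] in v9, the
##  0x5B (.Lk_s63) and 0x0F constants in v16/v17, and the schedule tables in
##  v18-v31.
##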
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adrp	x11, .Lk_sb1
	add	x11, x11, :lo12:.Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adrp	x10, .Lk_dksd
	add	x10, x10, :lo12:.Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
	adrp	x11, .Lk_mc_forward
	add	x11, x11, :lo12:.Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7

	adrp	x10, .Lk_sr		// lea	.Lk_sr(%rip),%r10
	add	x10, x10, :lo12:.Lk_sr

	add	x8, x8, x10
	cbnz	w3, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30			// xor	$0x30, %r8

.Lschedule_go:
	cmp	w1, #192			// cmp	$192,	%esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
.Lschedule_128:
	mov	x0, #10			// mov	$10, %esi

.Loop_schedule_128:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	.Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
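##  AArch64 note: the "high side" described above lives in v7 and the low
##  side in v6; the %xmm names are kept in the comments for reference back to
##  the x86-64 original.
##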
.align	4
.Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
	mov	x0, #4			// mov	$4,	%esi

.Loop_schedule_192:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7			// mov	$7, %esi

.Loop_schedule_256:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7

	b	.Loop_schedule_256

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adrp	x11, .Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	add	x11, x11, :lo12:.Lk_deskew

	cbnz	w3, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adrp	x11, .Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
	add	x11, x11, :lo12:.Lk_opt
	add	x2, x2, #32			// add	$32,	%rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add	$-16,	%rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
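##  AArch64 note: v7/v6 play the roles of %xmm7/%xmm6 above; the zero
##  register (%xmm13) is recreated locally in v1, and the results come back
##  in v6 (low half cleared) and v0.
##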
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
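##  AArch64 note: the key state is kept in v7, the round input/result in v0,
##  and the rcon in v8 (preloaded from .Lk_rcon); v1-v4 are clobbered.
##  Callers that enter _vpaes_schedule_low_round directly zero v4 first,
##  since it stands in for the zero operand of the emulated vpslldq.
##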
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
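##  AArch64 note: the lo/hi tables that (%r11) points at in the comments are
##  expected in v20/v21 here (e.g. .Lschedule_mangle_last reloads them with
##  .Lk_opt or .Lk_deskew), and the 0x0F mask is in v17; v1/v2 are clobbered.
##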
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
						// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
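##  AArch64 note: %rdx is x2 here (the output pointer, moved by +/-16 per
##  key) and %r8 is x8 (the pointer into .Lk_sr used for the shiftrows
##  permute, cycled through its four rows at .Lschedule_mangle_both).
##  .Lk_mc_forward[0] is preloaded in v9 and the .Lk_s63 constant in v16;
##  v0 is preserved, v1-v5 are clobbered.
##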
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	x2, x2, #16			// add	$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

						// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
						// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3

						// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
						// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	x2, x2, #16			// add	$-16,	%rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	add	x8, x8, #48			// add	$-16,	%r8
	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl	vpaes_set_encrypt_key
.hidden	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
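					// For reference: 128 >> 5 = 4, +5 = 9;
					// 192 -> 11; 256 -> 13.  This is the count
					// the cores loop on before their final round.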

	mov	w3, #0		// mov	$0,%ecx
	mov	x8, #0x30		// mov	$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
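
// For reference, a minimal C-side sketch of driving this entry point
// (illustrative only; the prototypes are assumed to match BoringSSL's
// internal AES declarations and are not part of this generated file):
//
//	uint8_t user_key[16], in[16], out[16];
//	AES_KEY ks;
//	vpaes_set_encrypt_key(user_key, 128, &ks);	// x0 = key, w1 = bits, x2 = &ks
//	vpaes_encrypt(in, out, &ks);			// x0 = in,  x1 = out,  x2 = &ks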

.globl	vpaes_set_decrypt_key
.hidden	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	$4,%eax
	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
	add	x2, x2, x9

	mov	w3, #1		// mov	$1,%ecx
	lsr	w8, w1, #1		// shr	$1,%r8d
	and	x8, x8, #32		// and	$32,%r8d
	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
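##
##  vpaes_cbc_encrypt
##
##  AAPCS64 arguments: x0 = input, x1 = output, x2 = length in bytes,
##  x3 = AES_KEY, x4 = 16-byte ivec, w5 = direction (zero selects the
##  branch to vpaes_cbc_decrypt below).  A zero length returns immediately
##  via .Lcbc_abort.
##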
.globl	vpaes_cbc_encrypt
.hidden	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2		// reassign
	mov	x2,  x3		// reassign

	ld1	{v0.16b}, [x4]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [x4]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2		// reassign
	mov	x2,  x3		// reassign
	ld1	{v6.16b}, [x4]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [x4]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
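##
##  vpaes_ctr32_encrypt_blocks
##
##  AAPCS64 arguments: x0 = input, x1 = output, x2 = number of 16-byte
##  blocks (not bytes), x3 = AES_KEY, x4 = 16-byte IV whose last four bytes
##  are the big-endian 32-bit counter.  Blocks are processed two at a time
##  with _vpaes_encrypt_2x after an optional single-block prologue.
##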
.globl	vpaes_ctr32_encrypt_blocks
.hidden	vpaes_ctr32_encrypt_blocks
.type	vpaes_ctr32_encrypt_blocks,%function
.align	4
vpaes_ctr32_encrypt_blocks:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	cbz	x2, .Lctr32_done

	// Note, unlike the other functions, x2 here is measured in blocks,
	// not bytes.
	mov	x17, x2
	mov	x2,  x3

	// Load the IV and counter portion.
	ldr	w6, [x4, #12]
	ld1	{v7.16b}, [x4]

	bl	_vpaes_encrypt_preheat
	tst	x17, #1
	rev	w6, w6		// The counter is big-endian.
	b.eq	.Lctr32_prep_loop

	// Handle one block so the remaining block count is even for
	// _vpaes_encrypt_2x.
	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
	bl	_vpaes_encrypt_core
	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #1
	// Update the counter.
	add	w6, w6, #1
	rev	w7, w6
	mov	v7.s[3], w7
	b.ls	.Lctr32_done

.Lctr32_prep_loop:
	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
	// uses v14 and v15.
	mov	v15.16b, v7.16b
	mov	v14.16b, v7.16b
	add	w6, w6, #1
	rev	w7, w6
	mov	v15.s[3], w7

.Lctr32_loop:
	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
	bl	_vpaes_encrypt_2x
	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #2
	// Update the counter.
	add	w7, w6, #1
	add	w6, w6, #2
	rev	w7, w7
	mov	v14.s[3], w7
	rev	w7, w6
	mov	v15.s[3], w7
	b.hi	.Lctr32_loop

.Lctr32_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)