xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/ghashv8-armv8-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <openssl/arm_arch.h>
8
9#if __ARM_MAX_ARCH__>=7
10.text
11.arch	armv8-a+crypto
// gcm_init_v8: expand the hash key H into the table of key powers that the
// GHASH routines in this file read.
//   x0 - destination table; six 16-byte entries (96 bytes) are stored.
//        x0 itself is advanced by 48 (the last store has no post-increment).
//   x1 - source H, 16 bytes.
// Table layout as written below:
//   [0] twisted H
//   [1] packed Karatsuba term: low half H.lo^H.hi, high half H^2.lo^H^2.hi
//   [2] H^2
//   [3] H^3
//   [4] packed Karatsuba term: low half H^3.lo^H^3.hi, high half H^4.lo^H^4.hi
//   [5] H^4
// Only v0-v7 and v16-v22 are written; no stack memory is used.
12.globl	gcm_init_v8
13
14.def gcm_init_v8
15   .type 32
16.endef
17.align	4
18gcm_init_v8:
19	AARCH64_VALID_CALL_TARGET
20	ld1	{v17.2d},[x1]		//load input H
21	movi	v19.16b,#0xe1
22	shl	v19.2d,v19.2d,#57		//0xc2.0: each 64-bit lane becomes 0xc2<<56
23	ext	v3.16b,v17.16b,v17.16b,#8	//v3 = H with its 64-bit halves swapped
24	ushr	v18.2d,v19.2d,#63
25	dup	v17.4s,v17.s[1]
26	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
27	ushr	v18.2d,v3.2d,#63		//top bit of each 64-bit lane
28	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
29	and	v18.16b,v18.16b,v16.16b
30	shl	v3.2d,v3.2d,#1
31	ext	v18.16b,v18.16b,v18.16b,#8	//move lane carries across the halves
32	and	v16.16b,v16.16b,v17.16b	//keep t0 only when the carry bit was set
33	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
34	eor	v20.16b,v3.16b,v16.16b		//twisted H
35	st1	{v20.2d},[x0],#16		//store Htable[0]
36
37	//calculate H^2
38	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
39	pmull	v0.1q,v20.1d,v20.1d		//lo·lo
40	eor	v16.16b,v16.16b,v20.16b	//v16 = H.lo^H.hi in both lanes
41	pmull2	v2.1q,v20.2d,v20.2d		//hi·hi
42	pmull	v1.1q,v16.1d,v16.1d		//(lo^hi)·(lo^hi)
43
44	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
45	eor	v18.16b,v0.16b,v2.16b
46	eor	v1.16b,v1.16b,v17.16b
47	eor	v1.16b,v1.16b,v18.16b
48	pmull	v18.1q,v0.1d,v19.1d		//1st phase
49
50	ins	v2.d[0],v1.d[1]
51	ins	v1.d[1],v0.d[0]
52	eor	v0.16b,v1.16b,v18.16b
53
54	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
55	pmull	v0.1q,v0.1d,v19.1d
56	eor	v18.16b,v18.16b,v2.16b
57	eor	v22.16b,v0.16b,v18.16b	//v22 = H^2
58
59	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
60	eor	v17.16b,v17.16b,v22.16b	//v17 = H^2.lo^H^2.hi in both lanes
61	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
62	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
63	//calculate H^3 and H^4
	//Two independent multiplications are interleaved: H·H^2 accumulates in
	//v0/v1/v2 and H^2·H^2 in v5/v6/v7; both reuse the v19 constant.
64	pmull	v0.1q,v20.1d, v22.1d
65	pmull	v5.1q,v22.1d,v22.1d
66	pmull2	v2.1q,v20.2d, v22.2d
67	pmull2	v7.1q,v22.2d,v22.2d
68	pmull	v1.1q,v16.1d,v17.1d
69	pmull	v6.1q,v17.1d,v17.1d
70
71	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
72	ext	v17.16b,v5.16b,v7.16b,#8
73	eor	v18.16b,v0.16b,v2.16b
74	eor	v1.16b,v1.16b,v16.16b
75	eor	v4.16b,v5.16b,v7.16b
76	eor	v6.16b,v6.16b,v17.16b
77	eor	v1.16b,v1.16b,v18.16b
78	pmull	v18.1q,v0.1d,v19.1d		//1st phase
79	eor	v6.16b,v6.16b,v4.16b
80	pmull	v4.1q,v5.1d,v19.1d
81
82	ins	v2.d[0],v1.d[1]
83	ins	v7.d[0],v6.d[1]
84	ins	v1.d[1],v0.d[0]
85	ins	v6.d[1],v5.d[0]
86	eor	v0.16b,v1.16b,v18.16b
87	eor	v5.16b,v6.16b,v4.16b
88
89	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
90	ext	v4.16b,v5.16b,v5.16b,#8
91	pmull	v0.1q,v0.1d,v19.1d
92	pmull	v5.1q,v5.1d,v19.1d
93	eor	v18.16b,v18.16b,v2.16b
94	eor	v4.16b,v4.16b,v7.16b
95	eor	v20.16b, v0.16b,v18.16b		//H^3
96	eor	v22.16b,v5.16b,v4.16b		//H^4
97
98	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
99	ext	v17.16b,v22.16b,v22.16b,#8
100	eor	v16.16b,v16.16b,v20.16b
101	eor	v17.16b,v17.16b,v22.16b
102	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
103	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
104	ret
105
// gcm_gmult_v8: one GHASH multiplication, Xi = Xi·H, performed in place.
//   x0 - Xi, 16 bytes; loaded at entry and overwritten with the result.
//   x1 - key table; only the first two 16-byte entries are read
//        (twisted H and the packed Karatsuba term).
// On builds without __AARCH64EB__ the rev64 + ext #8 pair reverses all
// 16 bytes of Xi on the way in and again on the way out.
// Only v0-v3 and v17-v21 are written; no stack memory is used.
106.globl	gcm_gmult_v8
107
108.def gcm_gmult_v8
109   .type 32
110.endef
111.align	4
112gcm_gmult_v8:
113	AARCH64_VALID_CALL_TARGET
114	ld1	{v17.2d},[x0]		//load Xi
115	movi	v19.16b,#0xe1
116	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
117	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant (0xc2<<56 per lane)
118#ifndef __AARCH64EB__
119	rev64	v17.16b,v17.16b
120#endif
121	ext	v3.16b,v17.16b,v17.16b,#8	//v3 = v17 with 64-bit halves swapped
122
123	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
124	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
125	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
126	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
127
128	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
129	eor	v18.16b,v0.16b,v2.16b
130	eor	v1.16b,v1.16b,v17.16b
131	eor	v1.16b,v1.16b,v18.16b
132	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
133
134	ins	v2.d[0],v1.d[1]
135	ins	v1.d[1],v0.d[0]
136	eor	v0.16b,v1.16b,v18.16b
137
138	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
139	pmull	v0.1q,v0.1d,v19.1d
140	eor	v18.16b,v18.16b,v2.16b
141	eor	v0.16b,v0.16b,v18.16b
142
143#ifndef __AARCH64EB__
144	rev64	v0.16b,v0.16b
145#endif
146	ext	v0.16b,v0.16b,v0.16b,#8	//undo the half swap before storing
147	st1	{v0.2d},[x0]		//write out Xi
148
149	ret
150
// gcm_ghash_v8: fold a run of 16-byte input blocks into Xi.
//   x0 - Xi, 16 bytes; loaded at entry, result stored back at Ldone_v8.
//   x1 - key table; this path reads the first three 16-byte entries
//        (twisted H, packed Karatsuba term, H^2). x1 is advanced by 32.
//   x2 - input pointer, advanced as blocks are consumed.
//   x3 - input length in bytes. Lengths >= 64 (unsigned compare) branch to
//        Lgcm_ghash_v8_4x; shorter inputs use the 2-block loop and/or the
//        single-block tail below.
//        NOTE(review): the code reads at least one block and steps in units
//        of 16/32 bytes, so it appears to expect a non-zero multiple of 16
//        -- confirm against callers.
// Also writes x12 (input post-increment, 16 or 0), the flags, v0-v7 and
// v16-v22; no stack memory is used.
151.globl	gcm_ghash_v8
152
153.def gcm_ghash_v8
154   .type 32
155.endef
156.align	4
157gcm_ghash_v8:
158	AARCH64_VALID_CALL_TARGET
159	cmp	x3,#64
160	b.hs	Lgcm_ghash_v8_4x	//64 bytes or more: use the 4x code
161	ld1	{v0.2d},[x0]		//load [rotated] Xi
162						//"[rotated]" means that
163						//loaded value would have
164						//to be rotated in order to
165						//make it appear as in
166						//algorithm specification
167	subs	x3,x3,#32		//see if x3 is 32 or larger
168	mov	x12,#16		//x12 is used as post-
169						//increment for input pointer;
170						//as loop is modulo-scheduled
171						//x12 is zeroed just in time
172						//to preclude overstepping
173						//inp[len], which means that
174						//last block[s] are actually
175						//loaded twice, but last
176						//copy is not processed
177	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
178	movi	v19.16b,#0xe1
179	ld1	{v22.2d},[x1]
180	csel	x12,xzr,x12,eq			//is it time to zero x12?
181	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
182	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
183	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
184#ifndef __AARCH64EB__
185	rev64	v16.16b,v16.16b
186	rev64	v0.16b,v0.16b
187#endif
188	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
189	b.lo	Lodd_tail_v8		//x3 was less than 32
190	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
191#ifndef __AARCH64EB__
192	rev64	v17.16b,v17.16b
193#endif
194	ext	v7.16b,v17.16b,v17.16b,#8
195	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
196	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
197	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
198	pmull2	v6.1q,v20.2d,v7.2d
199	b	Loop_mod2x_v8
200
	//Each pass handles two blocks: (Xi^I[i]) is multiplied by H^2 and
	//I[i+1] by H, the partial products are summed and reduced once, while
	//the next two blocks are loaded and pre-multiplied for the following pass.
201.align	4
202Loop_mod2x_v8:
203	ext	v18.16b,v3.16b,v3.16b,#8
204	subs	x3,x3,#32		//is there more data?
205	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
206	csel	x12,xzr,x12,lo			//is it time to zero x12?
207
208	pmull	v5.1q,v21.1d,v17.1d
209	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
210	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
211	eor	v0.16b,v0.16b,v4.16b		//accumulate
212	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
213	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
214
215	eor	v2.16b,v2.16b,v6.16b
216	csel	x12,xzr,x12,eq			//is it time to zero x12?
217	eor	v1.16b,v1.16b,v5.16b
218
219	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
220	eor	v18.16b,v0.16b,v2.16b
221	eor	v1.16b,v1.16b,v17.16b
222	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
223#ifndef __AARCH64EB__
224	rev64	v16.16b,v16.16b
225#endif
226	eor	v1.16b,v1.16b,v18.16b
227	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
228
229#ifndef __AARCH64EB__
230	rev64	v17.16b,v17.16b
231#endif
232	ins	v2.d[0],v1.d[1]
233	ins	v1.d[1],v0.d[0]
234	ext	v7.16b,v17.16b,v17.16b,#8
235	ext	v3.16b,v16.16b,v16.16b,#8
236	eor	v0.16b,v1.16b,v18.16b
237	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
238	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
239
240	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
241	pmull	v0.1q,v0.1d,v19.1d
242	eor	v3.16b,v3.16b,v18.16b
243	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
244	eor	v3.16b,v3.16b,v0.16b
245	pmull2	v6.1q,v20.2d,v7.2d
246	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
247
	//Loop exit: v3 already had the next block folded in speculatively, so
	//rebuild the plain Xi (v0) and rotated input (v3) for the tail.
248	eor	v2.16b,v2.16b,v18.16b
249	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
250	adds	x3,x3,#32		//re-construct x3
251	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
252	b.eq	Ldone_v8		//is x3 zero?
	//One block left (held in v16/v3): Xi = (Xi^I)·H
253Lodd_tail_v8:
254	ext	v18.16b,v0.16b,v0.16b,#8
255	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
256	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
257
258	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
259	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
260	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
261	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
262
263	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
264	eor	v18.16b,v0.16b,v2.16b
265	eor	v1.16b,v1.16b,v17.16b
266	eor	v1.16b,v1.16b,v18.16b
267	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
268
269	ins	v2.d[0],v1.d[1]
270	ins	v1.d[1],v0.d[0]
271	eor	v0.16b,v1.16b,v18.16b
272
273	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
274	pmull	v0.1q,v0.1d,v19.1d
275	eor	v18.16b,v18.16b,v2.16b
276	eor	v0.16b,v0.16b,v18.16b
277
278Ldone_v8:
279#ifndef __AARCH64EB__
280	rev64	v0.16b,v0.16b
281#endif
282	ext	v0.16b,v0.16b,v0.16b,#8
283	st1	{v0.2d},[x0]		//write out Xi
284
285	ret
286
// gcm_ghash_v8_4x: GHASH path that consumes four 16-byte blocks per loop pass.
// It is not declared .globl in this file and has no AARCH64_VALID_CALL_TARGET;
// the only entry visible here is the b.hs from gcm_ghash_v8 when x3 >= 64.
// Registers on entry are those of gcm_ghash_v8:
//   x0 - Xi (loaded here, stored back at the end)
//   x1 - key table; all six 16-byte entries are read (x1 is advanced by 48)
//   x2 - input pointer
//   x3 - input length in bytes (>= 64 when reached from gcm_ghash_v8)
// Register roles:
//   v20/v21/v22 - H, packed Karatsuba term (H | H^2), H^2
//   v26/v27/v28 - H^3, packed Karatsuba term (H^3 | H^4), H^4
//   v19         - 0xc2.0 reduction constant
//   v29/v30/v31 - running lo/mid/hi sums for blocks 1..3 of the current group
//   v4          - block 0 of the current group, still to be combined with Xi
// Writes the flags, v0-v7 and v16-v31; no stack memory is used.
287.def gcm_ghash_v8_4x
288   .type 32
289.endef
290.align	4
291gcm_ghash_v8_4x:
292Lgcm_ghash_v8_4x:
293	ld1	{v0.2d},[x0]		//load [rotated] Xi
294	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
295	movi	v19.16b,#0xe1
296	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
297	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
298
299	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//first four input blocks
300#ifndef __AARCH64EB__
301	rev64	v0.16b,v0.16b
302	rev64	v5.16b,v5.16b
303	rev64	v6.16b,v6.16b
304	rev64	v7.16b,v7.16b
305	rev64	v4.16b,v4.16b
306#endif
307	ext	v25.16b,v7.16b,v7.16b,#8
308	ext	v24.16b,v6.16b,v6.16b,#8
309	ext	v23.16b,v5.16b,v5.16b,#8
310
311	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
312	eor	v7.16b,v7.16b,v25.16b
313	pmull2	v31.1q,v20.2d,v25.2d
314	pmull	v30.1q,v21.1d,v7.1d
315
316	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
317	eor	v6.16b,v6.16b,v24.16b
318	pmull2	v24.1q,v22.2d,v24.2d
319	pmull2	v6.1q,v21.2d,v6.2d
320
321	eor	v29.16b,v29.16b,v16.16b
322	eor	v31.16b,v31.16b,v24.16b
323	eor	v30.16b,v30.16b,v6.16b
324
325	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
326	eor	v5.16b,v5.16b,v23.16b
327	pmull2	v23.1q,v26.2d,v23.2d
328	pmull	v5.1q,v27.1d,v5.1d
329
330	eor	v29.16b,v29.16b,v7.16b
331	eor	v31.16b,v31.16b,v23.16b
332	eor	v30.16b,v30.16b,v5.16b
333
334	subs	x3,x3,#128		//is there a second full group of 64 bytes?
335	b.lo	Ltail4x
336
337	b	Loop4x
338
	//Each pass finishes the pending group (H^4·(Xi+Ii) plus the sums in
	//v29-v31, then reduction into v0) while loading the next 64 bytes and
	//building that group's H, H^2 and H^3 products.
339.align	4
340Loop4x:
341	eor	v16.16b,v4.16b,v0.16b
342	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
343	ext	v3.16b,v16.16b,v16.16b,#8
344#ifndef __AARCH64EB__
345	rev64	v5.16b,v5.16b
346	rev64	v6.16b,v6.16b
347	rev64	v7.16b,v7.16b
348	rev64	v4.16b,v4.16b
349#endif
350
351	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
352	eor	v16.16b,v16.16b,v3.16b
353	pmull2	v2.1q,v28.2d,v3.2d
354	ext	v25.16b,v7.16b,v7.16b,#8
355	pmull2	v1.1q,v27.2d,v16.2d
356
357	eor	v0.16b,v0.16b,v29.16b
358	eor	v2.16b,v2.16b,v31.16b
359	ext	v24.16b,v6.16b,v6.16b,#8
360	eor	v1.16b,v1.16b,v30.16b
361	ext	v23.16b,v5.16b,v5.16b,#8
362
363	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
364	eor	v18.16b,v0.16b,v2.16b
365	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
366	eor	v7.16b,v7.16b,v25.16b
367	eor	v1.16b,v1.16b,v17.16b
368	pmull2	v31.1q,v20.2d,v25.2d
369	eor	v1.16b,v1.16b,v18.16b
370	pmull	v30.1q,v21.1d,v7.1d
371
372	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
373	ins	v2.d[0],v1.d[1]
374	ins	v1.d[1],v0.d[0]
375	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
376	eor	v6.16b,v6.16b,v24.16b
377	pmull2	v24.1q,v22.2d,v24.2d
378	eor	v0.16b,v1.16b,v18.16b
379	pmull2	v6.1q,v21.2d,v6.2d
380
381	eor	v29.16b,v29.16b,v16.16b
382	eor	v31.16b,v31.16b,v24.16b
383	eor	v30.16b,v30.16b,v6.16b
384
385	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
386	pmull	v0.1q,v0.1d,v19.1d
387	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
388	eor	v5.16b,v5.16b,v23.16b
389	eor	v18.16b,v18.16b,v2.16b
390	pmull2	v23.1q,v26.2d,v23.2d
391	pmull	v5.1q,v27.1d,v5.1d
392
393	eor	v0.16b,v0.16b,v18.16b
394	eor	v29.16b,v29.16b,v7.16b
395	eor	v31.16b,v31.16b,v23.16b
396	ext	v0.16b,v0.16b,v0.16b,#8
397	eor	v30.16b,v30.16b,v5.16b
398
399	subs	x3,x3,#64
400	b.hs	Loop4x		//at least 64 more bytes beyond the loaded group
401
	//Finish the last fully loaded group: v0/v1/v2 receive its unreduced
	//lo/mid/hi products. x3+64 is then the number of bytes still unread.
402Ltail4x:
403	eor	v16.16b,v4.16b,v0.16b
404	ext	v3.16b,v16.16b,v16.16b,#8
405
406	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
407	eor	v16.16b,v16.16b,v3.16b
408	pmull2	v2.1q,v28.2d,v3.2d
409	pmull2	v1.1q,v27.2d,v16.2d
410
411	eor	v0.16b,v0.16b,v29.16b
412	eor	v2.16b,v2.16b,v31.16b
413	eor	v1.16b,v1.16b,v30.16b
414
415	adds	x3,x3,#64
416	b.eq	Ldone4x		//nothing left: just reduce and store
417
418	cmp	x3,#32
419	b.lo	Lone		//fewer than 32 bytes left: one block
420	b.eq	Ltwo		//exactly 32 bytes left: two blocks
	//otherwise fall through and load three more blocks
421Lthree:
422	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
423	eor	v18.16b,v0.16b,v2.16b
424	eor	v1.16b,v1.16b,v17.16b
425	ld1	{v4.2d,v5.2d,v6.2d},[x2]
426	eor	v1.16b,v1.16b,v18.16b
427#ifndef	__AARCH64EB__
428	rev64	v5.16b,v5.16b
429	rev64	v6.16b,v6.16b
430	rev64	v4.16b,v4.16b
431#endif
432
433	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
434	ins	v2.d[0],v1.d[1]
435	ins	v1.d[1],v0.d[0]
436	ext	v24.16b,v6.16b,v6.16b,#8
437	ext	v23.16b,v5.16b,v5.16b,#8
438	eor	v0.16b,v1.16b,v18.16b
439
440	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
441	eor	v6.16b,v6.16b,v24.16b
442
443	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
444	pmull	v0.1q,v0.1d,v19.1d
445	eor	v18.16b,v18.16b,v2.16b
446	pmull2	v31.1q,v20.2d,v24.2d
447	pmull	v30.1q,v21.1d,v6.1d
448	eor	v0.16b,v0.16b,v18.16b
449	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
450	eor	v5.16b,v5.16b,v23.16b
451	ext	v0.16b,v0.16b,v0.16b,#8
452
453	pmull2	v23.1q,v22.2d,v23.2d
454	eor	v16.16b,v4.16b,v0.16b
455	pmull2	v5.1q,v21.2d,v5.2d
456	ext	v3.16b,v16.16b,v16.16b,#8
457
458	eor	v29.16b,v29.16b,v7.16b
459	eor	v31.16b,v31.16b,v23.16b
460	eor	v30.16b,v30.16b,v5.16b
461
462	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
463	eor	v16.16b,v16.16b,v3.16b
464	pmull2	v2.1q,v26.2d,v3.2d
465	pmull	v1.1q,v27.1d,v16.1d
466
467	eor	v0.16b,v0.16b,v29.16b
468	eor	v2.16b,v2.16b,v31.16b
469	eor	v1.16b,v1.16b,v30.16b
470	b	Ldone4x
471
472.align	4
473Ltwo:
474	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
475	eor	v18.16b,v0.16b,v2.16b
476	eor	v1.16b,v1.16b,v17.16b
477	ld1	{v4.2d,v5.2d},[x2]
478	eor	v1.16b,v1.16b,v18.16b
479#ifndef	__AARCH64EB__
480	rev64	v5.16b,v5.16b
481	rev64	v4.16b,v4.16b
482#endif
483
484	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
485	ins	v2.d[0],v1.d[1]
486	ins	v1.d[1],v0.d[0]
487	ext	v23.16b,v5.16b,v5.16b,#8
488	eor	v0.16b,v1.16b,v18.16b
489
490	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
491	pmull	v0.1q,v0.1d,v19.1d
492	eor	v18.16b,v18.16b,v2.16b
493	eor	v0.16b,v0.16b,v18.16b
494	ext	v0.16b,v0.16b,v0.16b,#8
495
496	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
497	eor	v5.16b,v5.16b,v23.16b
498
499	eor	v16.16b,v4.16b,v0.16b
500	ext	v3.16b,v16.16b,v16.16b,#8
501
502	pmull2	v31.1q,v20.2d,v23.2d
503	pmull	v30.1q,v21.1d,v5.1d
504
505	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
506	eor	v16.16b,v16.16b,v3.16b
507	pmull2	v2.1q,v22.2d,v3.2d
508	pmull2	v1.1q,v21.2d,v16.2d
509
510	eor	v0.16b,v0.16b,v29.16b
511	eor	v2.16b,v2.16b,v31.16b
512	eor	v1.16b,v1.16b,v30.16b
513	b	Ldone4x
514
515.align	4
516Lone:
517	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
518	eor	v18.16b,v0.16b,v2.16b
519	eor	v1.16b,v1.16b,v17.16b
520	ld1	{v4.2d},[x2]
521	eor	v1.16b,v1.16b,v18.16b
522#ifndef	__AARCH64EB__
523	rev64	v4.16b,v4.16b
524#endif
525
526	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
527	ins	v2.d[0],v1.d[1]
528	ins	v1.d[1],v0.d[0]
529	eor	v0.16b,v1.16b,v18.16b
530
531	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
532	pmull	v0.1q,v0.1d,v19.1d
533	eor	v18.16b,v18.16b,v2.16b
534	eor	v0.16b,v0.16b,v18.16b
535	ext	v0.16b,v0.16b,v0.16b,#8
536
537	eor	v16.16b,v4.16b,v0.16b
538	ext	v3.16b,v16.16b,v16.16b,#8
539
540	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
541	eor	v16.16b,v16.16b,v3.16b
542	pmull2	v2.1q,v20.2d,v3.2d
543	pmull	v1.1q,v21.1d,v16.1d
544
	//Common exit: v0/v1/v2 hold unreduced lo/mid/hi products; reduce them,
	//undo the rotation/byte order and store Xi.
545Ldone4x:
546	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
547	eor	v18.16b,v0.16b,v2.16b
548	eor	v1.16b,v1.16b,v17.16b
549	eor	v1.16b,v1.16b,v18.16b
550
551	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
552	ins	v2.d[0],v1.d[1]
553	ins	v1.d[1],v0.d[0]
554	eor	v0.16b,v1.16b,v18.16b
555
556	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
557	pmull	v0.1q,v0.1d,v19.1d
558	eor	v18.16b,v18.16b,v2.16b
559	eor	v0.16b,v0.16b,v18.16b
560	ext	v0.16b,v0.16b,v0.16b,#8
561
562#ifndef __AARCH64EB__
563	rev64	v0.16b,v0.16b
564#endif
565	st1	{v0.2d},[x0]		//write out Xi
566
567	ret
568
// NUL-terminated ASCII identification string embedded in the object:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
569.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
570.align	2
571.align	2
572#endif
573#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
574