// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>

.text

.globl	bn_mul_mont

.def bn_mul_mont
   .type 32
.endef
.align	5
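// bn_mul_mont computes the Montgomery product
// rp[] = ap[]*bp[]*2^(-64*num) mod np[]. The register mapping,
// inferred from the operand loads below, is x0=rp, x1=ap, x2=bp,
// x3=np, x4=&n0 (*n0 = -np[0]^-1 mod 2^64) and x5=num, the length
// in 64-bit words. num%8==0 dispatches to __bn_sqr8x_mont and
// num%4==0 to __bn_mul4x_mont; other sizes take the generic loop.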
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't.
	//	So when does it carry, and is there an alternative way
	//	to deduce it? If you follow the operations, you can
	//	observe that the condition for carry is quite simple:
	//	x6 being non-zero. The carry can therefore be calculated
	//	by adding -1 to x6, which is what the next instruction does.
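	//	Concretely: "subs xzr,x6,#1" computes x6-1, and AArch64
	//	sets C = NOT borrow, i.e. C=1 exactly when x6 is non-zero;
	//	the "adc" below then folds that carry into x13 just as the
	//	elided "adds" would have.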
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
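	// In C-like pseudocode (illustrative sketch only, not part of the
	// generated code):
	//	borrow = sub_words(rp, tp, np, num);	// the Lsub loop
	//	for (j = 0; j < num; j++)		// the Lcond_copy loop
	//		rp[j] = borrow ? tp[j] : rp[j];	// csel on "lo"
	//	memset(tp, 0, num*8);			// wipe the scratch area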
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.def __bn_sqr8x_mont
   .type 32
.endef
.align	5
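// Reached from bn_mul_mont when num is a multiple of 8. The compare
// below detects squaring (ap == bp); a genuine multiplication is
// redirected to __bn_mul4x_mont instead.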
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
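	// Only the cross products a[i]*a[j] with i>j are accumulated in
	// this pass; they are doubled and the squares a[i]*a[i] added in
	// Lsqr8x_outer_break below.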

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
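	// The doubling uses EXTR: "extr xD,xHI,xLO,#63" yields
	// (xHI:xLO)>>63 = (xHI<<1)|(xLO>>63), i.e. xHI shifted left one
	// bit with the carry-in taken from the top bit of the lower word.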
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
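	// (*)	Same trick as in Lmul_mont: the discarded addition is
	//	known to produce zero, so its carry is simply x19 != 0,
	//	which the "subs" above recovers.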
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26 and x28 hold the result, x6-x13 hold the modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.def __bn_mul4x_mont
   .type 32
.endef
.align	5
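// Reached from bn_mul_mont when num is a multiple of 4, and from
// __bn_sqr8x_mont when num is a multiple of 8 but ap != bp. Same
// register arguments as bn_mul_mont.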
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22 and x0 hold the result, x14-x17 hold the modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
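// The .byte string above spells "Montgomery Multiplication for ARMv8,
// CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.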
.align	2
.align	4
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
