// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register aliases shared by every routine in this file.
//
// Pointer arguments loaded from the Go frame.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc7: eight 64-bit limbs of working accumulator (a 512-bit product,
// or two independent 256-bit values).
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: short-lived temporaries.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1: hold p256const0/p256const1 in the field routines and the two
// low limbs of p256ord in the scalar (Ord) routines.
#define const0 R15
#define const1 R16

// hlp0 is an extra scratch register. hlp1 is the SAME register as res_ptr
// (R0): routines that use hlp1 must (re)load the result pointer separately.
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3: the two 256-bit operands (little-endian limbs) consumed
// and produced by the *Internal helpers.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 alias t2/t3; the Ord routines keep the two high limbs of
// p256ord there, so t2/t3 are not free as temporaries in those routines.
#define const2 t2
#define const3 t3

// p256const0 and p256const1 are limbs 1 and 3 of the field prime; limbs 0 and
// 2 are the immediates -1 and 0 (see "acc = poly" in p256NegCond).
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// Per-limb Montgomery reduction multiplier used by the Ord routines
// (its product with the low limb of p256ord is -1 mod 2^64).
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// The four limbs of the modulus used by p256OrdSqr/p256OrdMul, least
// significant first.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// The value stored as z3 when the result must have z = 1
// (see p256PointAddAffineAsm), least significant limb first.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// Flag 8 is RODATA in textflag.h.
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Tail-jumps to p256BigToLittle: the 32-byte reversal it performs is its own
// inverse, so one routine serves both directions.
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Same argument layout as p256BigToLittle, so it simply tail-jumps there.
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Tail-jumps to p256BigToLittle: the 32-byte reversal it performs is its own
// inverse.
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Reverses the 32 bytes at in into res: each of the four 64-bit words is
// byte-swapped (REV) and the word order is reversed by the stores.
// Both loads complete before the first store.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	// Byte-swap every word.
	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	// Store the words in reverse order.
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
// If cond == 0 res=b, else res=a
//
// Copies 96 bytes (two rounds of three 16-byte pairs). Both sources are always
// loaded and the choice is made with CSEL, so there is no branch on cond.
// The flags set by the single CMP stay live for both rounds: only LDP, CSEL
// and STP follow it, none of which writes the flags.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	a+8(FP), a_ptr
	MOVD	b+16(FP), b_ptr
	MOVD	cond+24(FP), R3

	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors
	// Bytes 0..47: R4-R9 hold a, R16-R22 hold b.
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	// EQ (cond == 0) picks the b word, otherwise the a word is kept.
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Bytes 48..95, same pattern.
	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
//
// In place: if cond != 0, val = p - val; if cond == 0, val is rewritten
// unchanged. The subtraction is always performed and the result chosen with
// CSEL, so there is no branch on cond.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+8(FP), hlp0
	MOVD	a_ptr, res_ptr
	// acc = poly
	MOVD	$-1, acc0
	MOVD	p256const0<>(SB), acc1
	MOVD	$0, acc2
	MOVD	p256const1<>(SB), acc3
	// Load the original value
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t2, acc2
	SBC	t3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, t0, acc0, acc0
	CSEL	EQ, t1, acc1, acc1
	CSEL	EQ, t2, acc2, acc2
	CSEL	EQ, t3, acc3, acc3
	// Store result
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
//
// Applies p256SqrInternal n times to in and stores the final value in res.
// n must be at least 1: the counter is decremented before it is tested, so
// n == 0 would wrap around instead of returning immediately.
// b_ptr (R2) is reused as the loop counter.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

sqrLoop:
	SUB	$1, b_ptr
	CALL	p256SqrInternal<>(SB)
	// The helper returns in y; feed it back as the next input.
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CBNZ	b_ptr, sqrLoop

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
//
// Thin wrapper: loads in1 into x, in2 into y, calls p256MulInternal and
// stores the product it leaves in y to res.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	// p256MulInternal expects the prime's limbs in const0/const1.
	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	CALL	p256MulInternal<>(SB)

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
//
// Runs the four reduction rounds of p256MulInternal on the input alone (no
// multiplication), then conditionally subtracts p once so the stored value is
// below p.
//
// Each round folds the lowest live limb into the three limbs above it using
// the shape of p: a shift by 32 (ADDS limb<<32 / LSR $32) plus one
// multiplication by const1. The folded limb's register is then reused as the
// new top limb, so the limb order rotates by one register per round.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3

	// t = acc - p. Carry set (CS) means no borrow, i.e. acc >= p.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	// Keep the subtracted value only when there was no borrow.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// Copies entry number idx (1-based) of a 16-entry table of 96-byte points
// into res. Every entry is read on every call and the wanted one is picked
// with CSEL, so the memory access pattern does not depend on idx.
// The result registers start at zero, so an idx outside 1..16 (including 0)
// stores an all-zero point.
//
// const0 holds idx and const1 the running 1-based entry counter; the prime's
// limbs are not needed here.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	idx+16(FP), const0
	MOVD	table+8(FP), b_ptr
	MOVD	res+0(FP), res_ptr

	// Clear the twelve result words (x, y, t).
	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3
	EOR	t0, t0, t0
	EOR	t1, t1, t1
	EOR	t2, t2, t2
	EOR	t3, t3, t3

	MOVD	$0, const1

loop_select:
		ADD	$1, const1
		// These flags stay live for the whole entry: LDP.P and CSEL do
		// not write the flags.
		CMP	const0, const1
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(b_ptr), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(b_ptr), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, t0, t0
		CSEL	EQ, acc1, t1, t1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, t2, t2
		CSEL	EQ, acc3, t3, t3

		// Always walk all 16 entries.
		CMP	$16, const1
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Copies entry number idx (1-based) of a 32-entry table of 64-byte affine
// points into res. Every entry is read on every call and the wanted one is
// picked with CSEL, so the memory access pattern does not depend on idx.
// The result registers start at zero, so an idx outside 1..32 (including 0)
// stores an all-zero point.
//
// t0 = idx, t1 = table cursor, t2 = running 1-based entry counter.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVD	idx+16(FP), t0
	MOVD	table+8(FP), t1
	MOVD	res+0(FP), res_ptr

	// Clear the eight result words (x, y).
	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3

	MOVD	$0, t2

loop_select:
		ADD	$1, t2
		// These flags stay live for the whole entry: LDP.P and CSEL do
		// not write the flags.
		CMP	t0, t2
		LDP.P	16(t1), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(t1), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(t1), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(t1), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3

		// Always walk all 32 entries.
		CMP	$32, t2
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Performs n Montgomery squarings modulo p256ord and stores the final value
// in res. n must be at least 1: the counter is decremented before it is
// tested.
//
// Register use specific to this routine:
//   hlp1 (= R0) holds p256ordK0, so res is only loaded at the very end.
//   const0..const3 hold the four limbs of p256ord (const2/const3 are t2/t3).
//   b_ptr is the loop counter; x0..x3 carry the value between iterations.
//   y0 is used as a temporary inside the reduction steps.
//
// In each reduction step hlp0 = acc_i * K0 is the multiple of the modulus to
// add. The low product word is computed as const0*hlp1 (const0*K0, i.e. -1
// mod 2^64) rather than const0*hlp0: adding it to acc_i yields the same
// carry, and the resulting low limb is discarded either way.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
	SUB	$1, b_ptr

	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the earlier steps, the carry out of this chain is folded into
	// acc7 (the top limb of the upper half) rather than into hlp0.
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the square; acc4 receives the carry-out.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// y = acc - ord over five limbs; carry set (CS) means no borrow.
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	// Select the reduced value directly into x for the next iteration.
	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

	// hlp1 shared R0 with res_ptr, so the result pointer is loaded only now.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Montgomery multiplication modulo p256ord: one row of the schoolbook product
// (y[i] * x) is interleaved with one reduction step, four times, followed by
// a single conditional subtraction of the modulus.
//
// Register use specific to this routine:
//   hlp1 (= R0) holds p256ordK0, so res is only loaded at the very end.
//   const0..const3 hold the four limbs of p256ord (const2/const3 are t2/t3).
//   y0 and y1 are reused as temporaries once their limb has been consumed.
//
// In each reduction step hlp0 = acc_i * K0 is the multiple of the modulus to
// add. The low product word is computed as const0*hlp1 (const0*K0, i.e. -1
// mod 2^64) rather than const0*hlp0: adding it to acc_i yields the same
// carry, and the resulting low limb is discarded either way.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the product; acc4 receives the carry-out.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - ord over five limbs; carry set (CS) means no borrow.
	// Writing t2/t3 here overwrites const2/const3 as they are read for the
	// last time by the same instructions.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// hlp1 shared R0 with res_ptr, so the result pointer is loaded only now.
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// p256SubInternal: register-only helper, x = y - x (mod p).
//
// In:     x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:    x0..x3. y0..y3 are left unchanged.
// Clobb:  acc0..acc7, t0, flags.
//
// The raw difference and the difference plus p are both computed; the latter
// is selected when the subtraction borrowed.
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	// t0 = 0 on no borrow, all ones on borrow.
	SBC	$0, ZR, t0

	// acc4..acc7 = difference + p.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// EQ (no borrow) keeps the raw difference.
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// p256SqrInternal: register-only helper, y = Montgomery square of x (mod p).
//
// In:     x0..x3; const0/const1 must hold p256const0/p256const1.
// Out:    y0..y3. x0..x3 are left unchanged.
// Clobb:  acc0..acc7, t0..t3, flags.
//
// The off-diagonal products are computed once and doubled, then the diagonal
// squares are added. Four reduction rounds (same shape as in p256FromMont)
// fold the low half away; the high half is added and p is conditionally
// subtracted once.
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p over five limbs; carry set (CS) means no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulInternal: register-only helper, y = Montgomery product of x and y
// (mod p).
//
// In:     x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:    y0..y3 (overwritten with the product). x0..x3 are left unchanged.
// Clobb:  acc0..acc7, t0..t3, hlp0, flags.
//
// One row of the schoolbook product (y[i] * x) is interleaved with one
// reduction round (same shape as in p256FromMont), four times; then the high
// half is added and p is conditionally subtracted once.
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p over five limbs; carry set (CS) means no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline: x = 2*y (mod p), expanded inline.
//
// In:     y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:    x0..x3. y0..y3 are left unchanged.
// Clobb:  t0..t3, hlp0, flags.
//
// hlp0 captures the carry out of the doubling. p is then subtracted over five
// limbs; CC (borrow) means the doubled value was already below p and is kept.
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;
/* ---------------------------------------*/
// Operand addressing for the point routines. A point is three consecutive
// 32-byte coordinates: x at offset 0, y at 32, z at 64.
// "1" names the point at a_ptr, "2" the point at b_ptr, "3" the result at
// res_ptr. Each macro takes a byte offset inside the coordinate.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a whole 32-byte value between one of the addressing macros above
// (or a stack-slot macro) and the x or y register quadruple.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// 32-byte stack slots, addressed from RSP + 8. The first eight are used by
// p256PointAddAffineAsm; p256PointAddAsm uses some of these plus the last
// four.
#define y2in(off)  (32*0 + 8 + off)(RSP)
#define s2(off)    (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off)	   (32*3 + 8 + off)(RSP)
#define r(off)	   (32*4 + 8 + off)(RSP)
#define hsqr(off)  (32*5 + 8 + off)(RSP)
#define rsqr(off)  (32*6 + 8 + off)(RSP)
#define hcub(off)  (32*7 + 8 + off)(RSP)

#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Adds the affine point in2 to the point in1 and writes the sum to res.
//   sign != 0: the y coordinate of in2 is negated (mod p) first.
//   sel  == 0: each output coordinate is replaced by the one from in1.
//   zero == 0: each output coordinate is replaced by the one from in2
//              (z3 = p256one); this replacement is applied after the sel one.
// All three flags are applied with CSEL; the full addition is always computed.
//
// hlp1 (= R0) holds the two selector bits (bit 0 = sel, bit 1 = zero), so the
// result pointer is reloaded into t0 before each store to res. The helpers
// called here do not touch hlp1, a_ptr or b_ptr.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr
	MOVD	sign+24(FP), hlp0
	MOVD	sel+32(FP), hlp1
	MOVD	zero+40(FP), t2

	// Normalise zero and sel to 0/1.
	MOVD	$1, t0
	CMP	$0, t2
	CSEL	EQ, ZR, t0, t2
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1
	// hlp1 = sel | zero<<1 (bit 1 is clear before the EOR).
	EOR	t2<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)
	MOVD	$-1, acc0

	// acc = p - y2, with t0 tracking the borrow.
	SUBS	y0, acc0, acc0
	SBCS	y1, const0, acc1
	SBCS	y2, ZR, acc2
	SBCS	y3, const1, acc3
	SBC	$0, ZR, t0

	// acc4..acc7 = acc + p; t0 additionally absorbs the carry.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADCS	const1, acc3, acc7
	ADC	$0, t0, t0

	// t0 == 0 selects the value with p added back.
	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	p256MulInternal<>(SB)    // x2 * z1^2

	LDx(x1in)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	p256MulInternal<>(SB)    // z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	// z1 is read into x before z3 is stored.
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	p256MulInternal<>(SB)    // z1 ^ 3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // y1 * h^3
	STy(s2)

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	p256MulInternal<>(SB)    // u1 * h^2
	// The h slot is reused to hold u1 * h^2 from here on.
	STP	(y0, y1), h(0*8)
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline               // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	// x = (slot h) - x, using the x left in registers by the selection above.
	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	p256SubInternal<>(SB)

	// y = r * x.
	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	p256MulInternal<>(SB)

	// x = y - (slot s2), where the slot holds y1 * h^3.
	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	p256SubInternal<>(SB)
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET

// p256AddInline: x = x + y (mod p), expanded inline.
//
// In:     x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:    x0..x3. y0..y3 are left unchanged.
// Clobb:  t0..t3, hlp0, flags.
//
// hlp0 captures the carry out of the addition. p is then subtracted over five
// limbs; CC (borrow) means the sum was already below p and is kept.
#define p256AddInline          \
	ADDS	y0, x0, x0;    \
	ADCS	y1, x1, x1;    \
	ADCS	y2, x2, x2;    \
	ADCS	y3, x3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;

// 32-byte stack slots used by p256PointDoubleAsm, addressed from RSP + 8.
#define s(off)	(32*0 + 8 + off)(RSP)
#define m(off)	(32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off)  (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in *P256Point)
//
// Doubles the point at in and writes the result to res. The comments below
// name the input coordinates x1, y1, z1 and track what each step leaves in
// the x/y register quadruples (all arithmetic is mod p, via the helpers).
// res_ptr stays in R0 for the whole routine: none of the helpers or inline
// macros used here writes R0.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point double
	// zsqr = z1^2
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	p256SqrInternal<>(SB)
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	// m = x1 + zsqr (y still holds zsqr)
	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline
	STx(m)

	// z3 = 2 * z1 * y1
	LDx(z1in)
	LDy(y1in)
	CALL	p256MulInternal<>(SB)
	p256MulBy2Inline
	STx(z3out)

	// y = (x1 - zsqr) * (x1 + zsqr)
	LDy(x1in)
	LDx(zsqr)
	CALL	p256SubInternal<>(SB)
	LDy(m)
	CALL	p256MulInternal<>(SB)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	// s = (2*y1)^2
	LDy(y1in)
	p256MulBy2Inline
	CALL	p256SqrInternal<>(SB)
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	// y = s^2
	CALL	p256SqrInternal<>(SB)

	// Divide by 2
	// t = y + p, with the carry-out in hlp0.
	ADDS	$-1, y0, t0
	ADCS	const0, y1, t1
	ADCS	$0, y2, t2
	ADCS	const1, y3, t3
	ADC	$0, ZR, hlp0

	// If y is even, shift y itself; otherwise shift y + p.
	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, t2, t2
	CSEL	EQ, y3, t3, t3
	// The carry limb only counts when y + p was selected (y odd).
	AND	y0, hlp0, hlp0

	// Shift the five-limb value right by one bit.
	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, t2, y1
	EXTR	$1, t2, t3, y2
	EXTR	$1, t3, hlp0, y3
	// Parked in the y3 output slot until the final subtraction.
	STy(y3out)

	// s = x1 * s; tmp = 2 * s
	LDx(x1in)
	LDy(s)
	CALL	p256MulInternal<>(SB)
	STy(s)
	p256MulBy2Inline
	STx(tmp)

	// x3 = m^2 - tmp
	LDx(m)
	CALL	p256SqrInternal<>(SB)
	LDx(tmp)
	CALL	p256SubInternal<>(SB)

	STx(x3out)

	// x = s - x3
	LDy(s)
	CALL	p256SubInternal<>(SB)

	// y = m * (s - x3)
	LDy(m)
	CALL	p256MulInternal<>(SB)

	// y3 = y - (value parked in the y3 slot)
	LDx(y3out)
	CALL	p256SubInternal<>(SB)
	STx(y3out)
	RET
/* ---------------------------------------*/
// Re-point the macros for p256PointAddAsm: y2in now addresses the second
// input point instead of a stack slot, and the output macros go through
// b_ptr, which is reloaded with res once in2 is no longer needed.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Adds the points in1 and in2 and writes the sum to res.
// Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero mod p (checked as
// "all limbs zero" or "limbs equal to p"), and 0 otherwise. The flag is built
// with CSEL/AND and the full addition is always carried out and stored.
//
// hlp1 (= R0) accumulates the return flag, which is why the result pointer is
// loaded into b_ptr rather than res_ptr. The helpers called here do not touch
// hlp1, a_ptr or b_ptr.
TEXT ·p256PointAddAsm(SB),0,$392-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL	p256SqrInternal<>(SB)    // z2^2
	STy(z2sqr)

	CALL	p256MulInternal<>(SB)    // z2^3

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	CALL	p256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2

	LDx(s1)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	// hlp1 = 1 if r is all-zero ...
	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp1

	// ... or if r equals p limb for limb (limb 2 of p is zero, so x2 is
	// ORed in directly).
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	// hlp0 = 1 if h is all-zero or equals p, same test as for r above.
	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp0

	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp0, hlp0

	// Return flag = (r == 0) AND (h == 0).
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	p256MulInternal<>(SB)    // s1 * h^3, kept in the s2 slot
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	p256MulInternal<>(SB)    // z1 * z2
	LDx(h)
	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
	// in2 has been read for the last time; b_ptr now points at res so the
	// x3out/y3out/z3out macros address the result.
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	p256MulInternal<>(SB)    // h^2 * u1
	// The u2 slot is reused to hold u1 * h^2 from here on.
	STy(u2)

	p256MulBy2Inline               // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3

	LDy(r)
	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDx(s2)
	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	// hlp1 is R0 itself, so the flag is stored directly; no intermediate
	// register move is needed.
	MOVD	hlp1, ret+24(FP)

	RET
