1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file contains the Go wrapper for the constant-time, 64-bit assembly
6// implementation of P256. The optimizations performed here are described in
7// detail in:
8// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
9//                          256-bit primes"
10// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
11// https://eprint.iacr.org/2013/816.pdf
12
13//go:build (amd64 || arm64 || ppc64le || s390x) && !purego
14
15package nistec
16
17import (
18	_ "embed"
19	"errors"
20	"internal/byteorder"
21	"math/bits"
22	"runtime"
23	"unsafe"
24)
25
26// p256Element is a P-256 base field element in [0, P-1] in the Montgomery
27// domain (with R 2²⁵⁶) as four limbs in little-endian order value.
28type p256Element [4]uint64
29
30// p256One is one in the Montgomery domain.
31var p256One = p256Element{0x0000000000000001, 0xffffffff00000000,
32	0xffffffffffffffff, 0x00000000fffffffe}
33
34var p256Zero = p256Element{}
35
36// p256P is 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1 in the Montgomery domain.
37var p256P = p256Element{0xffffffffffffffff, 0x00000000ffffffff,
38	0x0000000000000000, 0xffffffff00000001}
39
40// P256Point is a P-256 point. The zero value should not be assumed to be valid
41// (although it is in this implementation).
42type P256Point struct {
43	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
44	// at infinity can be represented by any set of coordinates with Z = 0.
45	x, y, z p256Element
46}
47
48// NewP256Point returns a new P256Point representing the point at infinity.
49func NewP256Point() *P256Point {
50	return &P256Point{
51		x: p256One, y: p256One, z: p256Zero,
52	}
53}
54
55// SetGenerator sets p to the canonical generator and returns p.
56func (p *P256Point) SetGenerator() *P256Point {
57	p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
58		0x79fb732b77622510, 0x18905f76a53755c6}
59	p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
60		0xd2e88688dd21f325, 0x8571ff1825885d85}
61	p.z = p256One
62	return p
63}
64
65// Set sets p = q and returns p.
66func (p *P256Point) Set(q *P256Point) *P256Point {
67	p.x, p.y, p.z = q.x, q.y, q.z
68	return p
69}
70
71const p256ElementLength = 32
72const p256UncompressedLength = 1 + 2*p256ElementLength
73const p256CompressedLength = 1 + p256ElementLength
74
75// SetBytes sets p to the compressed, uncompressed, or infinity value encoded in
76// b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on
77// the curve, it returns nil and an error, and the receiver is unchanged.
78// Otherwise, it returns p.
79func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
80	// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
81	// here is R in the Montgomery domain, or R×R mod p. See comment in
82	// P256OrdInverse about how this is used.
83	rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
84		0xfffffffffffffffe, 0x00000004fffffffd}
85
86	switch {
87	// Point at infinity.
88	case len(b) == 1 && b[0] == 0:
89		return p.Set(NewP256Point()), nil
90
91	// Uncompressed form.
92	case len(b) == p256UncompressedLength && b[0] == 4:
93		var r P256Point
94		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
95		p256BigToLittle(&r.y, (*[32]byte)(b[33:65]))
96		if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 {
97			return nil, errors.New("invalid P256 element encoding")
98		}
99		p256Mul(&r.x, &r.x, &rr)
100		p256Mul(&r.y, &r.y, &rr)
101		if err := p256CheckOnCurve(&r.x, &r.y); err != nil {
102			return nil, err
103		}
104		r.z = p256One
105		return p.Set(&r), nil
106
107	// Compressed form.
108	case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3):
109		var r P256Point
110		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
111		if p256LessThanP(&r.x) == 0 {
112			return nil, errors.New("invalid P256 element encoding")
113		}
114		p256Mul(&r.x, &r.x, &rr)
115
116		// y² = x³ - 3x + b
117		p256Polynomial(&r.y, &r.x)
118		if !p256Sqrt(&r.y, &r.y) {
119			return nil, errors.New("invalid P256 compressed point encoding")
120		}
121
122		// Select the positive or negative root, as indicated by the least
123		// significant bit, based on the encoding type byte.
124		yy := new(p256Element)
125		p256FromMont(yy, &r.y)
126		cond := int(yy[0]&1) ^ int(b[0]&1)
127		p256NegCond(&r.y, cond)
128
129		r.z = p256One
130		return p.Set(&r), nil
131
132	default:
133		return nil, errors.New("invalid P256 point encoding")
134	}
135}
136
137// p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
138func p256Polynomial(y2, x *p256Element) *p256Element {
139	x3 := new(p256Element)
140	p256Sqr(x3, x, 1)
141	p256Mul(x3, x3, x)
142
143	threeX := new(p256Element)
144	p256Add(threeX, x, x)
145	p256Add(threeX, threeX, x)
146	p256NegCond(threeX, 1)
147
148	p256B := &p256Element{0xd89cdf6229c4bddf, 0xacf005cd78843090,
149		0xe5a220abf7212ed6, 0xdc30061d04874834}
150
151	p256Add(x3, x3, threeX)
152	p256Add(x3, x3, p256B)
153
154	*y2 = *x3
155	return y2
156}
157
158func p256CheckOnCurve(x, y *p256Element) error {
159	// y² = x³ - 3x + b
160	rhs := p256Polynomial(new(p256Element), x)
161	lhs := new(p256Element)
162	p256Sqr(lhs, y, 1)
163	if p256Equal(lhs, rhs) != 1 {
164		return errors.New("P256 point not on curve")
165	}
166	return nil
167}
168
169// p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is
170// not allowed to be equal to or greater than p, so if this function returns 0
171// then x is invalid.
172func p256LessThanP(x *p256Element) int {
173	var b uint64
174	_, b = bits.Sub64(x[0], p256P[0], b)
175	_, b = bits.Sub64(x[1], p256P[1], b)
176	_, b = bits.Sub64(x[2], p256P[2], b)
177	_, b = bits.Sub64(x[3], p256P[3], b)
178	return int(b)
179}
180
181// p256Add sets res = x + y.
182func p256Add(res, x, y *p256Element) {
183	var c, b uint64
184	t1 := make([]uint64, 4)
185	t1[0], c = bits.Add64(x[0], y[0], 0)
186	t1[1], c = bits.Add64(x[1], y[1], c)
187	t1[2], c = bits.Add64(x[2], y[2], c)
188	t1[3], c = bits.Add64(x[3], y[3], c)
189	t2 := make([]uint64, 4)
190	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
191	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
192	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
193	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
194	// Three options:
195	//   - a+b < p
196	//     then c is 0, b is 1, and t1 is correct
197	//   - p <= a+b < 2^256
198	//     then c is 0, b is 0, and t2 is correct
199	//   - 2^256 <= a+b
200	//     then c is 1, b is 1, and t2 is correct
201	t2Mask := (c ^ b) - 1
202	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
203	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
204	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
205	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
206}
207
208// p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
209// false and e is unchanged. e and x can overlap.
210func p256Sqrt(e, x *p256Element) (isSquare bool) {
211	t0, t1 := new(p256Element), new(p256Element)
212
213	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
214	//
215	// The sequence of 7 multiplications and 253 squarings is derived from the
216	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
217	//
218	//	_10       = 2*1
219	//	_11       = 1 + _10
220	//	_1100     = _11 << 2
221	//	_1111     = _11 + _1100
222	//	_11110000 = _1111 << 4
223	//	_11111111 = _1111 + _11110000
224	//	x16       = _11111111 << 8 + _11111111
225	//	x32       = x16 << 16 + x16
226	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
227	//
228	p256Sqr(t0, x, 1)
229	p256Mul(t0, x, t0)
230	p256Sqr(t1, t0, 2)
231	p256Mul(t0, t0, t1)
232	p256Sqr(t1, t0, 4)
233	p256Mul(t0, t0, t1)
234	p256Sqr(t1, t0, 8)
235	p256Mul(t0, t0, t1)
236	p256Sqr(t1, t0, 16)
237	p256Mul(t0, t0, t1)
238	p256Sqr(t0, t0, 32)
239	p256Mul(t0, x, t0)
240	p256Sqr(t0, t0, 96)
241	p256Mul(t0, x, t0)
242	p256Sqr(t0, t0, 94)
243
244	p256Sqr(t1, t0, 1)
245	if p256Equal(t1, x) != 1 {
246		return false
247	}
248	*e = *t0
249	return true
250}
251
252// The following assembly functions are implemented in p256_asm_*.s
253
254// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
255//
256//go:noescape
257func p256Mul(res, in1, in2 *p256Element)
258
259// Montgomery square, repeated n times (n >= 1).
260//
261//go:noescape
262func p256Sqr(res, in *p256Element, n int)
263
264// Montgomery multiplication by R⁻¹, or 1 outside the domain.
265// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
266//
267//go:noescape
268func p256FromMont(res, in *p256Element)
269
270// If cond is not 0, sets val = -val mod p.
271//
272//go:noescape
273func p256NegCond(val *p256Element, cond int)
274
275// If cond is 0, sets res = b, otherwise sets res = a.
276//
277//go:noescape
278func p256MovCond(res, a, b *P256Point, cond int)
279
280//go:noescape
281func p256BigToLittle(res *p256Element, in *[32]byte)
282
283//go:noescape
284func p256LittleToBig(res *[32]byte, in *p256Element)
285
286//go:noescape
287func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
288
289//go:noescape
290func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
291
292// p256Table is a table of the first 16 multiples of a point. Points are stored
293// at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
294// [0]P is the point at infinity and it's not stored.
295type p256Table [16]P256Point
296
297// p256Select sets res to the point at index idx in the table.
298// idx must be in [0, 15]. It executes in constant time.
299//
300//go:noescape
301func p256Select(res *P256Point, table *p256Table, idx int)
302
303// p256AffinePoint is a point in affine coordinates (x, y). x and y are still
304// Montgomery domain elements. The point can't be the point at infinity.
305type p256AffinePoint struct {
306	x, y p256Element
307}
308
309// p256AffineTable is a table of the first 32 multiples of a point. Points are
310// stored at an index offset of -1 like in p256Table, and [0]P is not stored.
311type p256AffineTable [32]p256AffinePoint
312
313// p256Precomputed is a series of precomputed multiples of G, the canonical
314// generator. The first p256AffineTable contains multiples of G. The second one
315// multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
316// table is the previous table doubled six times. Six is the width of the
317// sliding window used in p256ScalarMult, and having each table already
318// pre-doubled lets us avoid the doublings between windows entirely. This table
319// MUST NOT be modified, as it aliases into p256PrecomputedEmbed below.
320var p256Precomputed *[43]p256AffineTable
321
322//go:embed p256_asm_table.bin
323var p256PrecomputedEmbed string
324
325func init() {
326	p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed))
327	if runtime.GOARCH == "s390x" {
328		var newTable [43 * 32 * 2 * 4]uint64
329		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) {
330			newTable[i] = byteorder.LeUint64(x[:])
331		}
332		newTablePtr := unsafe.Pointer(&newTable)
333		p256PrecomputedPtr = &newTablePtr
334	}
335	p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr)
336}
337
338// p256SelectAffine sets res to the point at index idx in the table.
339// idx must be in [0, 31]. It executes in constant time.
340//
341//go:noescape
342func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
343
344// Point addition with an affine point and constant time conditions.
345// If zero is 0, sets res = in2. If sel is 0, sets res = in1.
346// If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2
347//
348//go:noescape
349func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
350
351// Point addition. Sets res = in1 + in2. Returns one if the two input points
352// were equal and zero otherwise. If in1 or in2 are the point at infinity, res
353// and the return value are undefined.
354//
355//go:noescape
356func p256PointAddAsm(res, in1, in2 *P256Point) int
357
358// Point doubling. Sets res = in + in. in can be the point at infinity.
359//
360//go:noescape
361func p256PointDoubleAsm(res, in *P256Point)
362
363// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
364// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
365type p256OrdElement [4]uint64
366
367// p256OrdReduce ensures s is in the range [0, ord(G)-1].
368func p256OrdReduce(s *p256OrdElement) {
369	// Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G),
370	// keeping the result if it doesn't underflow.
371	t0, b := bits.Sub64(s[0], 0xf3b9cac2fc632551, 0)
372	t1, b := bits.Sub64(s[1], 0xbce6faada7179e84, b)
373	t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b)
374	t3, b := bits.Sub64(s[3], 0xffffffff00000000, b)
375	tMask := b - 1 // zero if subtraction underflowed
376	s[0] ^= (t0 ^ s[0]) & tMask
377	s[1] ^= (t1 ^ s[1]) & tMask
378	s[2] ^= (t2 ^ s[2]) & tMask
379	s[3] ^= (t3 ^ s[3]) & tMask
380}
381
382// Add sets q = p1 + p2, and returns q. The points may overlap.
383func (q *P256Point) Add(r1, r2 *P256Point) *P256Point {
384	var sum, double P256Point
385	r1IsInfinity := r1.isInfinity()
386	r2IsInfinity := r2.isInfinity()
387	pointsEqual := p256PointAddAsm(&sum, r1, r2)
388	p256PointDoubleAsm(&double, r1)
389	p256MovCond(&sum, &double, &sum, pointsEqual)
390	p256MovCond(&sum, r1, &sum, r2IsInfinity)
391	p256MovCond(&sum, r2, &sum, r1IsInfinity)
392	return q.Set(&sum)
393}
394
395// Double sets q = p + p, and returns q. The points may overlap.
396func (q *P256Point) Double(p *P256Point) *P256Point {
397	var double P256Point
398	p256PointDoubleAsm(&double, p)
399	return q.Set(&double)
400}
401
402// ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big
403// endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult
404// returns an error and the receiver is unchanged.
405func (r *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) {
406	if len(scalar) != 32 {
407		return nil, errors.New("invalid scalar length")
408	}
409	scalarReversed := new(p256OrdElement)
410	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
411	p256OrdReduce(scalarReversed)
412
413	r.p256BaseMult(scalarReversed)
414	return r, nil
415}
416
417// ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value,
418// and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an
419// error and the receiver is unchanged.
420func (r *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) {
421	if len(scalar) != 32 {
422		return nil, errors.New("invalid scalar length")
423	}
424	scalarReversed := new(p256OrdElement)
425	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
426	p256OrdReduce(scalarReversed)
427
428	r.Set(q).p256ScalarMult(scalarReversed)
429	return r, nil
430}
431
432// uint64IsZero returns 1 if x is zero and zero otherwise.
433func uint64IsZero(x uint64) int {
434	x = ^x
435	x &= x >> 32
436	x &= x >> 16
437	x &= x >> 8
438	x &= x >> 4
439	x &= x >> 2
440	x &= x >> 1
441	return int(x & 1)
442}
443
444// p256Equal returns 1 if a and b are equal and 0 otherwise.
445func p256Equal(a, b *p256Element) int {
446	var acc uint64
447	for i := range a {
448		acc |= a[i] ^ b[i]
449	}
450	return uint64IsZero(acc)
451}
452
453// isInfinity returns 1 if p is the point at infinity and 0 otherwise.
454func (p *P256Point) isInfinity() int {
455	return p256Equal(&p.z, &p256Zero)
456}
457
458// Bytes returns the uncompressed or infinity encoding of p, as specified in
459// SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
460// infinity is shorter than all other encodings.
461func (p *P256Point) Bytes() []byte {
462	// This function is outlined to make the allocations inline in the caller
463	// rather than happen on the heap.
464	var out [p256UncompressedLength]byte
465	return p.bytes(&out)
466}
467
468func (p *P256Point) bytes(out *[p256UncompressedLength]byte) []byte {
469	// The proper representation of the point at infinity is a single zero byte.
470	if p.isInfinity() == 1 {
471		return append(out[:0], 0)
472	}
473
474	x, y := new(p256Element), new(p256Element)
475	p.affineFromMont(x, y)
476
477	out[0] = 4 // Uncompressed form.
478	p256LittleToBig((*[32]byte)(out[1:33]), x)
479	p256LittleToBig((*[32]byte)(out[33:65]), y)
480
481	return out[:]
482}
483
484// affineFromMont sets (x, y) to the affine coordinates of p, converted out of the
485// Montgomery domain.
486func (p *P256Point) affineFromMont(x, y *p256Element) {
487	p256Inverse(y, &p.z)
488	p256Sqr(x, y, 1)
489	p256Mul(y, y, x)
490
491	p256Mul(x, &p.x, x)
492	p256Mul(y, &p.y, y)
493
494	p256FromMont(x, x)
495	p256FromMont(y, y)
496}
497
498// BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1,
499// Version 2.0, Section 2.3.5, or an error if p is the point at infinity.
500func (p *P256Point) BytesX() ([]byte, error) {
501	// This function is outlined to make the allocations inline in the caller
502	// rather than happen on the heap.
503	var out [p256ElementLength]byte
504	return p.bytesX(&out)
505}
506
507func (p *P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) {
508	if p.isInfinity() == 1 {
509		return nil, errors.New("P256 point is the point at infinity")
510	}
511
512	x := new(p256Element)
513	p256Inverse(x, &p.z)
514	p256Sqr(x, x, 1)
515	p256Mul(x, &p.x, x)
516	p256FromMont(x, x)
517	p256LittleToBig((*[32]byte)(out[:]), x)
518
519	return out[:], nil
520}
521
522// BytesCompressed returns the compressed or infinity encoding of p, as
523// specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the
524// point at infinity is shorter than all other encodings.
525func (p *P256Point) BytesCompressed() []byte {
526	// This function is outlined to make the allocations inline in the caller
527	// rather than happen on the heap.
528	var out [p256CompressedLength]byte
529	return p.bytesCompressed(&out)
530}
531
532func (p *P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte {
533	if p.isInfinity() == 1 {
534		return append(out[:0], 0)
535	}
536
537	x, y := new(p256Element), new(p256Element)
538	p.affineFromMont(x, y)
539
540	out[0] = 2 | byte(y[0]&1)
541	p256LittleToBig((*[32]byte)(out[1:33]), x)
542
543	return out[:]
544}
545
546// Select sets q to p1 if cond == 1, and to p2 if cond == 0.
547func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
548	p256MovCond(q, p1, p2, cond)
549	return q
550}
551
552// p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero.
553func p256Inverse(out, in *p256Element) {
554	// Inversion is calculated through exponentiation by p - 2, per Fermat's
555	// little theorem.
556	//
557	// The sequence of 12 multiplications and 255 squarings is derived from the
558	// following addition chain generated with github.com/mmcloughlin/addchain
559	// v0.4.0.
560	//
561	//  _10     = 2*1
562	//  _11     = 1 + _10
563	//  _110    = 2*_11
564	//  _111    = 1 + _110
565	//  _111000 = _111 << 3
566	//  _111111 = _111 + _111000
567	//  x12     = _111111 << 6 + _111111
568	//  x15     = x12 << 3 + _111
569	//  x16     = 2*x15 + 1
570	//  x32     = x16 << 16 + x16
571	//  i53     = x32 << 15
572	//  x47     = x15 + i53
573	//  i263    = ((i53 << 17 + 1) << 143 + x47) << 47
574	//  return    (x47 + i263) << 2 + 1
575	//
576	var z = new(p256Element)
577	var t0 = new(p256Element)
578	var t1 = new(p256Element)
579
580	p256Sqr(z, in, 1)
581	p256Mul(z, in, z)
582	p256Sqr(z, z, 1)
583	p256Mul(z, in, z)
584	p256Sqr(t0, z, 3)
585	p256Mul(t0, z, t0)
586	p256Sqr(t1, t0, 6)
587	p256Mul(t0, t0, t1)
588	p256Sqr(t0, t0, 3)
589	p256Mul(z, z, t0)
590	p256Sqr(t0, z, 1)
591	p256Mul(t0, in, t0)
592	p256Sqr(t1, t0, 16)
593	p256Mul(t0, t0, t1)
594	p256Sqr(t0, t0, 15)
595	p256Mul(z, z, t0)
596	p256Sqr(t0, t0, 17)
597	p256Mul(t0, in, t0)
598	p256Sqr(t0, t0, 143)
599	p256Mul(t0, z, t0)
600	p256Sqr(t0, t0, 47)
601	p256Mul(z, z, t0)
602	p256Sqr(z, z, 2)
603	p256Mul(out, in, z)
604}
605
606func boothW5(in uint) (int, int) {
607	var s uint = ^((in >> 5) - 1)
608	var d uint = (1 << 6) - in - 1
609	d = (d & s) | (in & (^s))
610	d = (d >> 1) + (d & 1)
611	return int(d), int(s & 1)
612}
613
614func boothW6(in uint) (int, int) {
615	var s uint = ^((in >> 6) - 1)
616	var d uint = (1 << 7) - in - 1
617	d = (d & s) | (in & (^s))
618	d = (d >> 1) + (d & 1)
619	return int(d), int(s & 1)
620}
621
622func (p *P256Point) p256BaseMult(scalar *p256OrdElement) {
623	var t0 p256AffinePoint
624
625	wvalue := (scalar[0] << 1) & 0x7f
626	sel, sign := boothW6(uint(wvalue))
627	p256SelectAffine(&t0, &p256Precomputed[0], sel)
628	p.x, p.y, p.z = t0.x, t0.y, p256One
629	p256NegCond(&p.y, sign)
630
631	index := uint(5)
632	zero := sel
633
634	for i := 1; i < 43; i++ {
635		if index < 192 {
636			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
637		} else {
638			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
639		}
640		index += 6
641		sel, sign = boothW6(uint(wvalue))
642		p256SelectAffine(&t0, &p256Precomputed[i], sel)
643		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
644		zero |= sel
645	}
646
647	// If the whole scalar was zero, set to the point at infinity.
648	p256MovCond(p, p, NewP256Point(), zero)
649}
650
651func (p *P256Point) p256ScalarMult(scalar *p256OrdElement) {
652	// precomp is a table of precomputed points that stores powers of p
653	// from p^1 to p^16.
654	var precomp p256Table
655	var t0, t1, t2, t3 P256Point
656
657	// Prepare the table
658	precomp[0] = *p // 1
659
660	p256PointDoubleAsm(&t0, p)
661	p256PointDoubleAsm(&t1, &t0)
662	p256PointDoubleAsm(&t2, &t1)
663	p256PointDoubleAsm(&t3, &t2)
664	precomp[1] = t0  // 2
665	precomp[3] = t1  // 4
666	precomp[7] = t2  // 8
667	precomp[15] = t3 // 16
668
669	p256PointAddAsm(&t0, &t0, p)
670	p256PointAddAsm(&t1, &t1, p)
671	p256PointAddAsm(&t2, &t2, p)
672	precomp[2] = t0 // 3
673	precomp[4] = t1 // 5
674	precomp[8] = t2 // 9
675
676	p256PointDoubleAsm(&t0, &t0)
677	p256PointDoubleAsm(&t1, &t1)
678	precomp[5] = t0 // 6
679	precomp[9] = t1 // 10
680
681	p256PointAddAsm(&t2, &t0, p)
682	p256PointAddAsm(&t1, &t1, p)
683	precomp[6] = t2  // 7
684	precomp[10] = t1 // 11
685
686	p256PointDoubleAsm(&t0, &t0)
687	p256PointDoubleAsm(&t2, &t2)
688	precomp[11] = t0 // 12
689	precomp[13] = t2 // 14
690
691	p256PointAddAsm(&t0, &t0, p)
692	p256PointAddAsm(&t2, &t2, p)
693	precomp[12] = t0 // 13
694	precomp[14] = t2 // 15
695
696	// Start scanning the window from top bit
697	index := uint(254)
698	var sel, sign int
699
700	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
701	sel, _ = boothW5(uint(wvalue))
702
703	p256Select(p, &precomp, sel)
704	zero := sel
705
706	for index > 4 {
707		index -= 5
708		p256PointDoubleAsm(p, p)
709		p256PointDoubleAsm(p, p)
710		p256PointDoubleAsm(p, p)
711		p256PointDoubleAsm(p, p)
712		p256PointDoubleAsm(p, p)
713
714		if index < 192 {
715			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
716		} else {
717			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
718		}
719
720		sel, sign = boothW5(uint(wvalue))
721
722		p256Select(&t0, &precomp, sel)
723		p256NegCond(&t0.y, sign)
724		p256PointAddAsm(&t1, p, &t0)
725		p256MovCond(&t1, &t1, p, sel)
726		p256MovCond(p, &t1, &t0, zero)
727		zero |= sel
728	}
729
730	p256PointDoubleAsm(p, p)
731	p256PointDoubleAsm(p, p)
732	p256PointDoubleAsm(p, p)
733	p256PointDoubleAsm(p, p)
734	p256PointDoubleAsm(p, p)
735
736	wvalue = (scalar[0] << 1) & 0x3f
737	sel, sign = boothW5(uint(wvalue))
738
739	p256Select(&t0, &precomp, sel)
740	p256NegCond(&t0.y, sign)
741	p256PointAddAsm(&t1, p, &t0)
742	p256MovCond(&t1, &t1, p, sel)
743	p256MovCond(p, &t1, &t0, zero)
744}
745