xref: /aosp_15_r20/external/vboot_reference/firmware/2lib/2modpow_sse2.c (revision 8617a60d3594060b7ecbd21bc622a7c14f3cf2bc)
/* Copyright 2023 The ChromiumOS Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 *
 * Authors: Muhammad Monir Hossain <[email protected]>
 *          Jeremy Compostella <[email protected]>
 */
8*8617a60dSAndroid Build Coastguard Worker 
/*
 * The algorithm implemented below is described in the paper "Montgomery
 * Multiplication Using Vector Instructions" from Microsoft Research,
 * August 20, 2013 (cf. https://eprint.iacr.org/2013/519.pdf).
 *
 * This implementation leverages SSE2 instructions to perform arithmetic
 * operations in parallel.
 *
 * The algorithm uses the positive inverse of the modulus (1 / N mod 2^32),
 * which is easily computed from the negative inverse provided by the `n0inv'
 * field of the public key data structure.
 */
21*8617a60dSAndroid Build Coastguard Worker 
22*8617a60dSAndroid Build Coastguard Worker #include "2api.h"
23*8617a60dSAndroid Build Coastguard Worker #include "2common.h"
24*8617a60dSAndroid Build Coastguard Worker #include "2return_codes.h"
25*8617a60dSAndroid Build Coastguard Worker #include "2rsa.h"
26*8617a60dSAndroid Build Coastguard Worker 
/*
 * 128-bit vector types built on the compiler's generic vector extension.
 *
 * vb2_m128i is the type passed between the helpers below (two 64-bit lanes,
 * allowed to alias other types); vb2_v4si and vb2_v2du are the views used to
 * operate on it as four signed 32-bit or two unsigned 64-bit elements.
 */
typedef long long vb2_m128i __attribute__((__vector_size__(16), __may_alias__));
typedef int vb2_v4si __attribute__((__vector_size__(16)));
typedef unsigned long long vb2_v2du __attribute__((__vector_size__(16)));
30*8617a60dSAndroid Build Coastguard Worker 
31*8617a60dSAndroid Build Coastguard Worker static inline vb2_m128i __attribute__((__always_inline__))
vb2_set_epi32(int q3,int q2,int q1,int q0)32*8617a60dSAndroid Build Coastguard Worker vb2_set_epi32 (int q3, int q2, int q1, int q0)
33*8617a60dSAndroid Build Coastguard Worker {
34*8617a60dSAndroid Build Coastguard Worker 	return (vb2_m128i)(vb2_v4si){ q0, q1, q2, q3 };
35*8617a60dSAndroid Build Coastguard Worker }
36*8617a60dSAndroid Build Coastguard Worker 
37*8617a60dSAndroid Build Coastguard Worker static inline vb2_m128i __attribute__((__always_inline__))
vb2_setzero_si128(void)38*8617a60dSAndroid Build Coastguard Worker vb2_setzero_si128 (void)
39*8617a60dSAndroid Build Coastguard Worker {
40*8617a60dSAndroid Build Coastguard Worker 	return (vb2_m128i)(vb2_v4si){ 0, 0, 0, 0 };
41*8617a60dSAndroid Build Coastguard Worker }
42*8617a60dSAndroid Build Coastguard Worker 
43*8617a60dSAndroid Build Coastguard Worker static inline vb2_m128i __attribute__((__always_inline__))
vb2_add_epi64(vb2_m128i a,vb2_m128i b)44*8617a60dSAndroid Build Coastguard Worker vb2_add_epi64 (vb2_m128i a, vb2_m128i b)
45*8617a60dSAndroid Build Coastguard Worker {
46*8617a60dSAndroid Build Coastguard Worker 	return (vb2_m128i)((vb2_v2du)a + (vb2_v2du)b);
47*8617a60dSAndroid Build Coastguard Worker }
48*8617a60dSAndroid Build Coastguard Worker 
49*8617a60dSAndroid Build Coastguard Worker static inline vb2_m128i __attribute__((__always_inline__))
vb2_srli_epi64(vb2_m128i a,int b)50*8617a60dSAndroid Build Coastguard Worker vb2_srli_epi64 (vb2_m128i a, int b)
51*8617a60dSAndroid Build Coastguard Worker {
52*8617a60dSAndroid Build Coastguard Worker 	return (vb2_m128i)__builtin_ia32_psrlqi128(a, b);
53*8617a60dSAndroid Build Coastguard Worker }
54*8617a60dSAndroid Build Coastguard Worker 
55*8617a60dSAndroid Build Coastguard Worker static inline vb2_m128i __attribute__((__always_inline__))
vb2_mul_epu32(vb2_m128i a,vb2_m128i b)56*8617a60dSAndroid Build Coastguard Worker vb2_mul_epu32 (vb2_m128i a, vb2_m128i b)
57*8617a60dSAndroid Build Coastguard Worker {
58*8617a60dSAndroid Build Coastguard Worker 	return (vb2_m128i)__builtin_ia32_pmuludq128((vb2_v4si)a, (vb2_v4si)b);
59*8617a60dSAndroid Build Coastguard Worker }
60*8617a60dSAndroid Build Coastguard Worker 
61*8617a60dSAndroid Build Coastguard Worker static inline vb2_m128i __attribute__((__always_inline__))
vb2_and_si128(vb2_m128i a,vb2_m128i b)62*8617a60dSAndroid Build Coastguard Worker vb2_and_si128 (vb2_m128i a, vb2_m128i b)
63*8617a60dSAndroid Build Coastguard Worker {
64*8617a60dSAndroid Build Coastguard Worker 	return (vb2_m128i)((vb2_v2du)a & (vb2_v2du)b);
65*8617a60dSAndroid Build Coastguard Worker }
66*8617a60dSAndroid Build Coastguard Worker 
67*8617a60dSAndroid Build Coastguard Worker /**
68*8617a60dSAndroid Build Coastguard Worker  * Montgomery c[] = d[] - e[] if d[] > e[], c[] = d[] - e[] + mod[] otherwise.
69*8617a60dSAndroid Build Coastguard Worker  *
70*8617a60dSAndroid Build Coastguard Worker  * de[] has d[] in lower 64 bits (effectively lower 32 bits) and e[] in upper
71*8617a60dSAndroid Build Coastguard Worker  * 64 bits (effectively lower 32 bits)
72*8617a60dSAndroid Build Coastguard Worker  * de[] is used as a temporary buffer and therefore its content will be lost.
73*8617a60dSAndroid Build Coastguard Worker  */
sub_mod(const struct vb2_public_key * key,vb2_m128i * de,uint32_t * c)74*8617a60dSAndroid Build Coastguard Worker static void sub_mod(const struct vb2_public_key *key, vb2_m128i *de, uint32_t *c)
75*8617a60dSAndroid Build Coastguard Worker {
76*8617a60dSAndroid Build Coastguard Worker 	uint32_t i, borrow = 0, carry = 0, d, e;
77*8617a60dSAndroid Build Coastguard Worker 	uint64_t sum, *de_i;
78*8617a60dSAndroid Build Coastguard Worker 
79*8617a60dSAndroid Build Coastguard Worker 	for (i = 0; i < key->arrsize; i++) {
80*8617a60dSAndroid Build Coastguard Worker 		de_i = (uint64_t *)&de[i];
81*8617a60dSAndroid Build Coastguard Worker 		d = (uint32_t)de_i[1];
82*8617a60dSAndroid Build Coastguard Worker 		e = (uint32_t)de_i[0];
83*8617a60dSAndroid Build Coastguard Worker 
84*8617a60dSAndroid Build Coastguard Worker 		/* Use de_i[0] as temporary storage of d[] - e[]. */
85*8617a60dSAndroid Build Coastguard Worker 		de_i[0] = (uint32_t)d - e - borrow;
86*8617a60dSAndroid Build Coastguard Worker 
87*8617a60dSAndroid Build Coastguard Worker 		borrow = d ^ ((d ^ e) | (d ^ (uint32_t)de_i[0]));
88*8617a60dSAndroid Build Coastguard Worker 		borrow >>= 31;
89*8617a60dSAndroid Build Coastguard Worker 	}
90*8617a60dSAndroid Build Coastguard Worker 
91*8617a60dSAndroid Build Coastguard Worker 	/* To keep the code running in constant-time for side-channel
92*8617a60dSAndroid Build Coastguard Worker 	 * resistance, D − E + mod is systematically computed even if we do not
93*8617a60dSAndroid Build Coastguard Worker 	 * need it. */
94*8617a60dSAndroid Build Coastguard Worker 	for (i = 0; i < key->arrsize; i++) {
95*8617a60dSAndroid Build Coastguard Worker 		de_i = (uint64_t *)&de[i];
96*8617a60dSAndroid Build Coastguard Worker 		sum = de_i[0] + key->n[i] + carry;
97*8617a60dSAndroid Build Coastguard Worker 		carry = sum >> 32;
98*8617a60dSAndroid Build Coastguard Worker 
99*8617a60dSAndroid Build Coastguard Worker 		/* Use de_i[1] as temporary storage. */
100*8617a60dSAndroid Build Coastguard Worker 		de_i[1] = (uint32_t)sum;
101*8617a60dSAndroid Build Coastguard Worker 	}
102*8617a60dSAndroid Build Coastguard Worker 
103*8617a60dSAndroid Build Coastguard Worker 	int index = borrow ? 1 : 0;
104*8617a60dSAndroid Build Coastguard Worker 	for (i = 0; i < key->arrsize; i++) {
105*8617a60dSAndroid Build Coastguard Worker 		de_i = (uint64_t *)&de[i];
106*8617a60dSAndroid Build Coastguard Worker 		c[i] = (uint32_t)de_i[index];
107*8617a60dSAndroid Build Coastguard Worker 	}
108*8617a60dSAndroid Build Coastguard Worker }
109*8617a60dSAndroid Build Coastguard Worker 
110*8617a60dSAndroid Build Coastguard Worker /**
111*8617a60dSAndroid Build Coastguard Worker  * Montgomery c[] = a[] * b[] / R % mod
112*8617a60dSAndroid Build Coastguard Worker  */
mont_mult(const struct vb2_public_key * key,uint32_t * c,const uint32_t * a,const uint32_t * b,const uint32_t mu,vb2_m128i * de,vb2_m128i * b_modulus)113*8617a60dSAndroid Build Coastguard Worker static void mont_mult(const struct vb2_public_key *key,
114*8617a60dSAndroid Build Coastguard Worker 		      uint32_t *c,
115*8617a60dSAndroid Build Coastguard Worker 		      const uint32_t *a,
116*8617a60dSAndroid Build Coastguard Worker 		      const uint32_t *b,
117*8617a60dSAndroid Build Coastguard Worker 		      const uint32_t mu,
118*8617a60dSAndroid Build Coastguard Worker 		      vb2_m128i *de,
119*8617a60dSAndroid Build Coastguard Worker 		      vb2_m128i *b_modulus)
120*8617a60dSAndroid Build Coastguard Worker {
121*8617a60dSAndroid Build Coastguard Worker 	const uint32_t mub0 = mu * b[0];
122*8617a60dSAndroid Build Coastguard Worker 	const vb2_m128i mask = vb2_set_epi32(0,  0xffffffff, 0, 0xffffffff);
123*8617a60dSAndroid Build Coastguard Worker 	const uint64_t *de0 = (uint64_t *)de;
124*8617a60dSAndroid Build Coastguard Worker 	uint32_t i, j, q, muc0;
125*8617a60dSAndroid Build Coastguard Worker 	vb2_m128i p01, t01, mul;
126*8617a60dSAndroid Build Coastguard Worker 
127*8617a60dSAndroid Build Coastguard Worker 	for (i = 0; i < key->arrsize; i++) {
128*8617a60dSAndroid Build Coastguard Worker 		b_modulus[i] = vb2_set_epi32(0, b[i], 0, key->n[i]);
129*8617a60dSAndroid Build Coastguard Worker 		de[i] = vb2_setzero_si128();
130*8617a60dSAndroid Build Coastguard Worker 	}
131*8617a60dSAndroid Build Coastguard Worker 
132*8617a60dSAndroid Build Coastguard Worker 	for (j = 0; j < key->arrsize; j++) {
133*8617a60dSAndroid Build Coastguard Worker 		c[0] = (uint32_t)de0[1] - de0[0];
134*8617a60dSAndroid Build Coastguard Worker 		muc0 = mu * c[0];
135*8617a60dSAndroid Build Coastguard Worker 
136*8617a60dSAndroid Build Coastguard Worker 		q = muc0 + mub0 * a[j];
137*8617a60dSAndroid Build Coastguard Worker 
138*8617a60dSAndroid Build Coastguard Worker 		mul = vb2_set_epi32(0, a[j], 0, q);
139*8617a60dSAndroid Build Coastguard Worker 
140*8617a60dSAndroid Build Coastguard Worker 		p01 = vb2_add_epi64(de[0], vb2_mul_epu32(mul, b_modulus[0]));
141*8617a60dSAndroid Build Coastguard Worker 
142*8617a60dSAndroid Build Coastguard Worker 		t01 = vb2_srli_epi64(p01, 32);
143*8617a60dSAndroid Build Coastguard Worker 
144*8617a60dSAndroid Build Coastguard Worker 		for (i = 1; i < key->arrsize; i++) {
145*8617a60dSAndroid Build Coastguard Worker 			p01 = vb2_add_epi64(vb2_add_epi64(t01, de[i]),
146*8617a60dSAndroid Build Coastguard Worker 					    vb2_mul_epu32(mul, b_modulus[i]));
147*8617a60dSAndroid Build Coastguard Worker 
148*8617a60dSAndroid Build Coastguard Worker 			t01 = vb2_srli_epi64(p01, 32);
149*8617a60dSAndroid Build Coastguard Worker 
150*8617a60dSAndroid Build Coastguard Worker 			de[i - 1] = vb2_and_si128(mask, p01);
151*8617a60dSAndroid Build Coastguard Worker 		}
152*8617a60dSAndroid Build Coastguard Worker 
153*8617a60dSAndroid Build Coastguard Worker 		de[key->arrsize - 1] = t01;
154*8617a60dSAndroid Build Coastguard Worker 	}
155*8617a60dSAndroid Build Coastguard Worker 
156*8617a60dSAndroid Build Coastguard Worker 	sub_mod(key, de, c);
157*8617a60dSAndroid Build Coastguard Worker }
158*8617a60dSAndroid Build Coastguard Worker 
/**
 * Convert a multi-word integer between big-endian and little-endian layouts:
 * the order of the 32-bit words is reversed and the bytes inside each word
 * are swapped.
 *
 * Works for disjoint buffers and in place (in == out).  Partially overlapping
 * buffers are not supported.
 *
 * @param in	Source words
 * @param out	Destination words
 * @param size	Number of 32-bit words in each buffer
 */
static void swap_endianness(const uint32_t *in, uint32_t *out, size_t size)
{
	size_t head = 0, tail = size;

	/* Walk inwards from both ends and load both words of a pair before
	 * storing either, so the in-place case never reads a word it has
	 * already overwritten. */
	while (head < tail) {
		uint32_t front, back;

		tail--;
		front = in[head];
		back = in[tail];
		out[head] = __builtin_bswap32(back);
		out[tail] = __builtin_bswap32(front);
		head++;
	}
}
166*8617a60dSAndroid Build Coastguard Worker 
vb2ex_hwcrypto_modexp(const struct vb2_public_key * key,uint8_t * inout,void * workbuf,size_t workbuf_size,int exp)167*8617a60dSAndroid Build Coastguard Worker vb2_error_t vb2ex_hwcrypto_modexp(const struct vb2_public_key *key,
168*8617a60dSAndroid Build Coastguard Worker 				  uint8_t *inout, void *workbuf,
169*8617a60dSAndroid Build Coastguard Worker 				  size_t workbuf_size, int exp)
170*8617a60dSAndroid Build Coastguard Worker {
171*8617a60dSAndroid Build Coastguard Worker 	const uint32_t mu = -key->n0inv;
172*8617a60dSAndroid Build Coastguard Worker 	uint32_t *a = workbuf;
173*8617a60dSAndroid Build Coastguard Worker 	uint32_t *aR = a + key->arrsize;
174*8617a60dSAndroid Build Coastguard Worker 	uint32_t *aaR = aR + key->arrsize;
175*8617a60dSAndroid Build Coastguard Worker 	uint32_t *aaa = aaR;  /* Re-use location. */
176*8617a60dSAndroid Build Coastguard Worker 	vb2_m128i *de = (vb2_m128i *)(((uintptr_t)(aaa + key->arrsize) + 0xf) & ~0xf);
177*8617a60dSAndroid Build Coastguard Worker 	vb2_m128i *b_modulus = de + key->arrsize;
178*8617a60dSAndroid Build Coastguard Worker 	size_t i;
179*8617a60dSAndroid Build Coastguard Worker 
180*8617a60dSAndroid Build Coastguard Worker 	if ((void *)&b_modulus[key->arrsize] - workbuf > workbuf_size) {
181*8617a60dSAndroid Build Coastguard Worker 		VB2_DEBUG("ERROR - HW modexp work buffer too small!\n");
182*8617a60dSAndroid Build Coastguard Worker 		return VB2_ERROR_WORKBUF_SMALL;
183*8617a60dSAndroid Build Coastguard Worker 	}
184*8617a60dSAndroid Build Coastguard Worker 
185*8617a60dSAndroid Build Coastguard Worker 	/* Convert big endian to little endian. */
186*8617a60dSAndroid Build Coastguard Worker 	swap_endianness((uint32_t *)inout, a, key->arrsize);
187*8617a60dSAndroid Build Coastguard Worker 
188*8617a60dSAndroid Build Coastguard Worker 	/* aR = a * RR / R mod M  */
189*8617a60dSAndroid Build Coastguard Worker 	mont_mult(key, aR, a, key->rr, mu, de, b_modulus);
190*8617a60dSAndroid Build Coastguard Worker 	if (exp == 3) {
191*8617a60dSAndroid Build Coastguard Worker 		/* aaR = aR * aR / R mod M */
192*8617a60dSAndroid Build Coastguard Worker 		mont_mult(key, aaR, aR, aR, mu, de, b_modulus);
193*8617a60dSAndroid Build Coastguard Worker 		/* a = aaR * aR / R mod M */
194*8617a60dSAndroid Build Coastguard Worker 		mont_mult(key, a, aaR, aR, mu, de, b_modulus);
195*8617a60dSAndroid Build Coastguard Worker 
196*8617a60dSAndroid Build Coastguard Worker 		/* To multiply with 1, prepare aR with first element 1 and
197*8617a60dSAndroid Build Coastguard Worker 		 * others as 0. */
198*8617a60dSAndroid Build Coastguard Worker 		aR[0] = 1;
199*8617a60dSAndroid Build Coastguard Worker 		for (i = 1; i < key->arrsize; i++)
200*8617a60dSAndroid Build Coastguard Worker 			aR[i] = 0;
201*8617a60dSAndroid Build Coastguard Worker 
202*8617a60dSAndroid Build Coastguard Worker 		/* aaa = a * aR / R mod M = a * 1 / R mod M*/
203*8617a60dSAndroid Build Coastguard Worker 		mont_mult(key, aaa, a, aR, mu, de, b_modulus);
204*8617a60dSAndroid Build Coastguard Worker 	} else {
205*8617a60dSAndroid Build Coastguard Worker 		/* Exponent 65537 */
206*8617a60dSAndroid Build Coastguard Worker 		for (i = 0; i < 16; i += 2) {
207*8617a60dSAndroid Build Coastguard Worker 			/* aaR = aR * aR / R mod M */
208*8617a60dSAndroid Build Coastguard Worker 			mont_mult(key, aaR, aR, aR, mu, de, b_modulus);
209*8617a60dSAndroid Build Coastguard Worker 			/* aR = aaR * aaR / R mod M */
210*8617a60dSAndroid Build Coastguard Worker 			mont_mult(key, aR, aaR, aaR, mu, de, b_modulus);
211*8617a60dSAndroid Build Coastguard Worker 		}
212*8617a60dSAndroid Build Coastguard Worker 		/* aaa = aR * a / R mod M */
213*8617a60dSAndroid Build Coastguard Worker 		mont_mult(key, aaa, aR, a, mu, de, b_modulus);
214*8617a60dSAndroid Build Coastguard Worker 	}
215*8617a60dSAndroid Build Coastguard Worker 
216*8617a60dSAndroid Build Coastguard Worker 	/* Convert little endian to big endian. */
217*8617a60dSAndroid Build Coastguard Worker 	swap_endianness(aaa, (uint32_t *)inout, key->arrsize);
218*8617a60dSAndroid Build Coastguard Worker 
219*8617a60dSAndroid Build Coastguard Worker 	return VB2_SUCCESS;
220*8617a60dSAndroid Build Coastguard Worker }
221