xref: /aosp_15_r20/external/vboot_reference/firmware/2lib/2modpow_sse2.c (revision 8617a60d3594060b7ecbd21bc622a7c14f3cf2bc)
1 /* Copyright 2023 The ChromiumOS Authors
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  *
5  * Authors: Muhammad Monir Hossain <[email protected]>
6  *          Jeremy Compostella <[email protected]>
7  */
8 
9 /*
10  * The algorithm implemented below is described in Montgomery Multiplication
11  * Using Vector Instructions document from Microsoft Research, August 20, 2013
12  * (cf. https://eprint.iacr.org/2013/519.pdf).
13  *
14  * This implementation leverages SSE2 instructions to perform arithmetic
15  * operations in parallel.
16  *
17  * This algorithm uses the modulus positive inverse (1 / N mod 2^32) which can
18  * be easily computed from the modulus negative inverse provided by the public
19  * key data structure `n0inv' field.
20  */
21 
22 #include "2api.h"
23 #include "2common.h"
24 #include "2return_codes.h"
25 #include "2rsa.h"
26 
27 typedef long long vb2_m128i __attribute__((__vector_size__(16), __may_alias__));
28 typedef int vb2_v4si __attribute__((__vector_size__(16)));
29 typedef unsigned long long vb2_v2du __attribute__((__vector_size__(16)));
30 
31 static inline vb2_m128i __attribute__((__always_inline__))
vb2_set_epi32(int q3,int q2,int q1,int q0)32 vb2_set_epi32 (int q3, int q2, int q1, int q0)
33 {
34 	return (vb2_m128i)(vb2_v4si){ q0, q1, q2, q3 };
35 }
36 
37 static inline vb2_m128i __attribute__((__always_inline__))
vb2_setzero_si128(void)38 vb2_setzero_si128 (void)
39 {
40 	return (vb2_m128i)(vb2_v4si){ 0, 0, 0, 0 };
41 }
42 
43 static inline vb2_m128i __attribute__((__always_inline__))
vb2_add_epi64(vb2_m128i a,vb2_m128i b)44 vb2_add_epi64 (vb2_m128i a, vb2_m128i b)
45 {
46 	return (vb2_m128i)((vb2_v2du)a + (vb2_v2du)b);
47 }
48 
49 static inline vb2_m128i __attribute__((__always_inline__))
vb2_srli_epi64(vb2_m128i a,int b)50 vb2_srli_epi64 (vb2_m128i a, int b)
51 {
52 	return (vb2_m128i)__builtin_ia32_psrlqi128(a, b);
53 }
54 
55 static inline vb2_m128i __attribute__((__always_inline__))
vb2_mul_epu32(vb2_m128i a,vb2_m128i b)56 vb2_mul_epu32 (vb2_m128i a, vb2_m128i b)
57 {
58 	return (vb2_m128i)__builtin_ia32_pmuludq128((vb2_v4si)a, (vb2_v4si)b);
59 }
60 
61 static inline vb2_m128i __attribute__((__always_inline__))
vb2_and_si128(vb2_m128i a,vb2_m128i b)62 vb2_and_si128 (vb2_m128i a, vb2_m128i b)
63 {
64 	return (vb2_m128i)((vb2_v2du)a & (vb2_v2du)b);
65 }
66 
67 /**
68  * Montgomery c[] = d[] - e[] if d[] > e[], c[] = d[] - e[] + mod[] otherwise.
69  *
70  * de[] has d[] in lower 64 bits (effectively lower 32 bits) and e[] in upper
71  * 64 bits (effectively lower 32 bits)
72  * de[] is used as a temporary buffer and therefore its content will be lost.
73  */
sub_mod(const struct vb2_public_key * key,vb2_m128i * de,uint32_t * c)74 static void sub_mod(const struct vb2_public_key *key, vb2_m128i *de, uint32_t *c)
75 {
76 	uint32_t i, borrow = 0, carry = 0, d, e;
77 	uint64_t sum, *de_i;
78 
79 	for (i = 0; i < key->arrsize; i++) {
80 		de_i = (uint64_t *)&de[i];
81 		d = (uint32_t)de_i[1];
82 		e = (uint32_t)de_i[0];
83 
84 		/* Use de_i[0] as temporary storage of d[] - e[]. */
85 		de_i[0] = (uint32_t)d - e - borrow;
86 
87 		borrow = d ^ ((d ^ e) | (d ^ (uint32_t)de_i[0]));
88 		borrow >>= 31;
89 	}
90 
91 	/* To keep the code running in constant-time for side-channel
92 	 * resistance, D − E + mod is systematically computed even if we do not
93 	 * need it. */
94 	for (i = 0; i < key->arrsize; i++) {
95 		de_i = (uint64_t *)&de[i];
96 		sum = de_i[0] + key->n[i] + carry;
97 		carry = sum >> 32;
98 
99 		/* Use de_i[1] as temporary storage. */
100 		de_i[1] = (uint32_t)sum;
101 	}
102 
103 	int index = borrow ? 1 : 0;
104 	for (i = 0; i < key->arrsize; i++) {
105 		de_i = (uint64_t *)&de[i];
106 		c[i] = (uint32_t)de_i[index];
107 	}
108 }
109 
110 /**
111  * Montgomery c[] = a[] * b[] / R % mod
112  */
mont_mult(const struct vb2_public_key * key,uint32_t * c,const uint32_t * a,const uint32_t * b,const uint32_t mu,vb2_m128i * de,vb2_m128i * b_modulus)113 static void mont_mult(const struct vb2_public_key *key,
114 		      uint32_t *c,
115 		      const uint32_t *a,
116 		      const uint32_t *b,
117 		      const uint32_t mu,
118 		      vb2_m128i *de,
119 		      vb2_m128i *b_modulus)
120 {
121 	const uint32_t mub0 = mu * b[0];
122 	const vb2_m128i mask = vb2_set_epi32(0,  0xffffffff, 0, 0xffffffff);
123 	const uint64_t *de0 = (uint64_t *)de;
124 	uint32_t i, j, q, muc0;
125 	vb2_m128i p01, t01, mul;
126 
127 	for (i = 0; i < key->arrsize; i++) {
128 		b_modulus[i] = vb2_set_epi32(0, b[i], 0, key->n[i]);
129 		de[i] = vb2_setzero_si128();
130 	}
131 
132 	for (j = 0; j < key->arrsize; j++) {
133 		c[0] = (uint32_t)de0[1] - de0[0];
134 		muc0 = mu * c[0];
135 
136 		q = muc0 + mub0 * a[j];
137 
138 		mul = vb2_set_epi32(0, a[j], 0, q);
139 
140 		p01 = vb2_add_epi64(de[0], vb2_mul_epu32(mul, b_modulus[0]));
141 
142 		t01 = vb2_srli_epi64(p01, 32);
143 
144 		for (i = 1; i < key->arrsize; i++) {
145 			p01 = vb2_add_epi64(vb2_add_epi64(t01, de[i]),
146 					    vb2_mul_epu32(mul, b_modulus[i]));
147 
148 			t01 = vb2_srli_epi64(p01, 32);
149 
150 			de[i - 1] = vb2_and_si128(mask, p01);
151 		}
152 
153 		de[key->arrsize - 1] = t01;
154 	}
155 
156 	sub_mod(key, de, c);
157 }
158 
swap_endianness(const uint32_t * in,uint32_t * out,size_t size)159 static void swap_endianness(const uint32_t *in, uint32_t *out, size_t size)
160 {
161 	size_t i;
162 
163 	for (i = 0; i < size; i++)
164 		out[i] = __builtin_bswap32(in[size - 1 - i]);
165 }
166 
vb2ex_hwcrypto_modexp(const struct vb2_public_key * key,uint8_t * inout,void * workbuf,size_t workbuf_size,int exp)167 vb2_error_t vb2ex_hwcrypto_modexp(const struct vb2_public_key *key,
168 				  uint8_t *inout, void *workbuf,
169 				  size_t workbuf_size, int exp)
170 {
171 	const uint32_t mu = -key->n0inv;
172 	uint32_t *a = workbuf;
173 	uint32_t *aR = a + key->arrsize;
174 	uint32_t *aaR = aR + key->arrsize;
175 	uint32_t *aaa = aaR;  /* Re-use location. */
176 	vb2_m128i *de = (vb2_m128i *)(((uintptr_t)(aaa + key->arrsize) + 0xf) & ~0xf);
177 	vb2_m128i *b_modulus = de + key->arrsize;
178 	size_t i;
179 
180 	if ((void *)&b_modulus[key->arrsize] - workbuf > workbuf_size) {
181 		VB2_DEBUG("ERROR - HW modexp work buffer too small!\n");
182 		return VB2_ERROR_WORKBUF_SMALL;
183 	}
184 
185 	/* Convert big endian to little endian. */
186 	swap_endianness((uint32_t *)inout, a, key->arrsize);
187 
188 	/* aR = a * RR / R mod M  */
189 	mont_mult(key, aR, a, key->rr, mu, de, b_modulus);
190 	if (exp == 3) {
191 		/* aaR = aR * aR / R mod M */
192 		mont_mult(key, aaR, aR, aR, mu, de, b_modulus);
193 		/* a = aaR * aR / R mod M */
194 		mont_mult(key, a, aaR, aR, mu, de, b_modulus);
195 
196 		/* To multiply with 1, prepare aR with first element 1 and
197 		 * others as 0. */
198 		aR[0] = 1;
199 		for (i = 1; i < key->arrsize; i++)
200 			aR[i] = 0;
201 
202 		/* aaa = a * aR / R mod M = a * 1 / R mod M*/
203 		mont_mult(key, aaa, a, aR, mu, de, b_modulus);
204 	} else {
205 		/* Exponent 65537 */
206 		for (i = 0; i < 16; i += 2) {
207 			/* aaR = aR * aR / R mod M */
208 			mont_mult(key, aaR, aR, aR, mu, de, b_modulus);
209 			/* aR = aaR * aaR / R mod M */
210 			mont_mult(key, aR, aaR, aaR, mu, de, b_modulus);
211 		}
212 		/* aaa = aR * a / R mod M */
213 		mont_mult(key, aaa, aR, a, mu, de, b_modulus);
214 	}
215 
216 	/* Convert little endian to big endian. */
217 	swap_endianness(aaa, (uint32_t *)inout, key->arrsize);
218 
219 	return VB2_SUCCESS;
220 }
221