/* Copyright 2023 The ChromiumOS Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 *
 * Authors: Muhammad Monir Hossain <[email protected]>
 * Jeremy Compostella <[email protected]>
 */

/*
 * The algorithm implemented below is described in the Montgomery
 * Multiplication Using Vector Instructions paper from Microsoft Research,
 * August 20, 2013 (cf. https://eprint.iacr.org/2013/519.pdf).
 *
 * This implementation leverages SSE2 instructions to perform arithmetic
 * operations in parallel.
 *
 * This algorithm uses the positive inverse of the modulus (1 / N mod 2^32),
 * which is easily computed from the negative inverse provided in the `n0inv'
 * field of the public key data structure.
 */
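
/*
 * Since `n0inv' stores the negative inverse -1 / n[0] mod 2^32 (cf. struct
 * vb2_public_key), the positive inverse used below is simply its two's
 * complement: mu = -n0inv mod 2^32 (see the `mu' computation in
 * vb2ex_hwcrypto_modexp()).
 */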

#include "2api.h"
#include "2common.h"
#include "2return_codes.h"
#include "2rsa.h"

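/*
 * Local equivalents of the __m128i, __v4si and __v2du vector types normally
 * defined in <emmintrin.h>; declaring them here avoids a dependency on the
 * SSE2 intrinsics header.
 */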
typedef long long vb2_m128i __attribute__((__vector_size__(16), __may_alias__));
typedef int vb2_v4si __attribute__((__vector_size__(16)));
typedef unsigned long long vb2_v2du __attribute__((__vector_size__(16)));

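/* Equivalent of _mm_set_epi32(): q0 ends up in the lowest 32 bits. */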
static inline vb2_m128i __attribute__((__always_inline__))
vb2_set_epi32(int q3, int q2, int q1, int q0)
{
	return (vb2_m128i)(vb2_v4si){ q0, q1, q2, q3 };
}

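/* Equivalent of _mm_setzero_si128(): an all-zero 128-bit value. */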
static inline vb2_m128i __attribute__((__always_inline__))
vb2_setzero_si128(void)
{
	return (vb2_m128i)(vb2_v4si){ 0, 0, 0, 0 };
}

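/* Equivalent of _mm_add_epi64(): two independent 64-bit additions. */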
static inline vb2_m128i __attribute__((__always_inline__))
vb2_add_epi64(vb2_m128i a, vb2_m128i b)
{
	return (vb2_m128i)((vb2_v2du)a + (vb2_v2du)b);
}

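/* Equivalent of _mm_srli_epi64(): logical right shift of each 64-bit lane. */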
static inline vb2_m128i __attribute__((__always_inline__))
vb2_srli_epi64(vb2_m128i a, int b)
{
	return (vb2_m128i)__builtin_ia32_psrlqi128(a, b);
}

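/*
 * Equivalent of _mm_mul_epu32(): multiplies the unsigned low 32 bits of each
 * 64-bit lane, yielding two full 64-bit products.
 */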
static inline vb2_m128i __attribute__((__always_inline__))
vb2_mul_epu32(vb2_m128i a, vb2_m128i b)
{
	return (vb2_m128i)__builtin_ia32_pmuludq128((vb2_v4si)a, (vb2_v4si)b);
}

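/* Equivalent of _mm_and_si128(): bitwise AND of two 128-bit values. */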
static inline vb2_m128i __attribute__((__always_inline__))
vb2_and_si128(vb2_m128i a, vb2_m128i b)
{
	return (vb2_m128i)((vb2_v2du)a & (vb2_v2du)b);
}

/**
 * Montgomery c[] = d[] - e[] if d[] >= e[], c[] = d[] - e[] + mod[] otherwise.
 *
 * Each de[] element holds d[] in its upper 64 bits and e[] in its lower 64
 * bits (in both cases only the low 32 bits of the lane are significant).
 * de[] is used as a temporary buffer and therefore its content will be lost.
 */
static void sub_mod(const struct vb2_public_key *key, vb2_m128i *de,
		    uint32_t *c)
{
	uint32_t i, borrow = 0, carry = 0, d, e;
	uint64_t sum, *de_i;

	for (i = 0; i < key->arrsize; i++) {
		de_i = (uint64_t *)&de[i];
		d = (uint32_t)de_i[1];
		e = (uint32_t)de_i[0];

		/* Use de_i[0] as temporary storage of d[] - e[]. */
		de_i[0] = (uint32_t)d - e - borrow;

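		/*
		 * Branch-free borrow-out of the 32-bit subtraction: bit 31 of
		 * d ^ ((d ^ e) | (d ^ (d - e - borrow))) is set exactly when
		 * d < e + borrow.
		 */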
		borrow = d ^ ((d ^ e) | (d ^ (uint32_t)de_i[0]));
		borrow >>= 31;
	}

	/* To keep the code running in constant-time for side-channel
	 * resistance, D - E + mod is systematically computed even if we do
	 * not need it. */
	for (i = 0; i < key->arrsize; i++) {
		de_i = (uint64_t *)&de[i];
		sum = de_i[0] + key->n[i] + carry;
		carry = sum >> 32;

		/* Use de_i[1] as temporary storage. */
		de_i[1] = (uint32_t)sum;
	}

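	/*
	 * Pick d[] - e[] (de_i[0]) when the subtraction did not borrow,
	 * d[] - e[] + mod[] (de_i[1]) otherwise.
	 */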
	int index = borrow ? 1 : 0;
	for (i = 0; i < key->arrsize; i++) {
		de_i = (uint64_t *)&de[i];
		c[i] = (uint32_t)de_i[index];
	}
}

/**
 * Montgomery c[] = a[] * b[] / R % mod
 */
static void mont_mult(const struct vb2_public_key *key,
		      uint32_t *c,
		      const uint32_t *a,
		      const uint32_t *b,
		      const uint32_t mu,
		      vb2_m128i *de,
		      vb2_m128i *b_modulus)
{
	const uint32_t mub0 = mu * b[0];
	const vb2_m128i mask = vb2_set_epi32(0, 0xffffffff, 0, 0xffffffff);
	const uint64_t *de0 = (uint64_t *)de;
	uint32_t i, j, q, muc0;
	vb2_m128i p01, t01, mul;

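	/*
	 * Pack b[i] (upper 64-bit lane) next to n[i] (lower lane) so that a
	 * single vb2_mul_epu32() computes a[j] * b[i] and q * n[i] in
	 * parallel; de[] accumulates the two running sums D (upper lane) and
	 * E (lower lane) of the paper's algorithm.
	 */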
	for (i = 0; i < key->arrsize; i++) {
		b_modulus[i] = vb2_set_epi32(0, b[i], 0, key->n[i]);
		de[i] = vb2_setzero_si128();
	}

	for (j = 0; j < key->arrsize; j++) {
		c[0] = (uint32_t)de0[1] - de0[0];
		muc0 = mu * c[0];

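		/*
		 * Montgomery quotient digit: q = mu * (c[0] + a[j] * b[0])
		 * mod 2^32 makes the low 32-bit word of D - E vanish after
		 * this iteration, so the lanes can be shifted down by one
		 * word (de[i - 1] = ...), i.e. divided by 2^32.
		 */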
		q = muc0 + mub0 * a[j];

		mul = vb2_set_epi32(0, a[j], 0, q);

		p01 = vb2_add_epi64(de[0], vb2_mul_epu32(mul, b_modulus[0]));

		t01 = vb2_srli_epi64(p01, 32);

		for (i = 1; i < key->arrsize; i++) {
			p01 = vb2_add_epi64(vb2_add_epi64(t01, de[i]),
					    vb2_mul_epu32(mul, b_modulus[i]));

			t01 = vb2_srli_epi64(p01, 32);

			de[i - 1] = vb2_and_si128(mask, p01);
		}

		de[key->arrsize - 1] = t01;
	}

	sub_mod(key, de, c);
}

static void swap_endianness(const uint32_t *in, uint32_t *out, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++)
		out[i] = __builtin_bswap32(in[size - 1 - i]);
}

vb2_error_t vb2ex_hwcrypto_modexp(const struct vb2_public_key *key,
				  uint8_t *inout, void *workbuf,
				  size_t workbuf_size, int exp)
{
	const uint32_t mu = -key->n0inv;
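	/*
	 * Work buffer layout: three arrsize-word scalar arrays (a, aR,
	 * aaR/aaa) followed by two 16-byte aligned vector arrays (de and
	 * b_modulus) of arrsize 128-bit elements each.
	 */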
	uint32_t *a = workbuf;
	uint32_t *aR = a + key->arrsize;
	uint32_t *aaR = aR + key->arrsize;
	uint32_t *aaa = aaR; /* Re-use location. */
	vb2_m128i *de =
		(vb2_m128i *)(((uintptr_t)(aaa + key->arrsize) + 0xf) & ~0xf);
	vb2_m128i *b_modulus = de + key->arrsize;
	size_t i;

	if ((void *)&b_modulus[key->arrsize] - workbuf > workbuf_size) {
		VB2_DEBUG("ERROR - HW modexp work buffer too small!\n");
		return VB2_ERROR_WORKBUF_SMALL;
	}

	/* Convert big endian to little endian. */
	swap_endianness((uint32_t *)inout, a, key->arrsize);

	/* aR = a * RR / R mod M */
	mont_mult(key, aR, a, key->rr, mu, de, b_modulus);
	if (exp == 3) {
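		/*
		 * a^3 is computed as ((a*R)^2 / R) * (a*R) / R = a^3 * R; the
		 * final multiplication by 1 divides out the remaining R.
		 */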
		/* aaR = aR * aR / R mod M */
		mont_mult(key, aaR, aR, aR, mu, de, b_modulus);
		/* a = aaR * aR / R mod M */
		mont_mult(key, a, aaR, aR, mu, de, b_modulus);

		/* To multiply with 1, prepare aR with first element 1 and
		 * others as 0. */
		aR[0] = 1;
		for (i = 1; i < key->arrsize; i++)
			aR[i] = 0;

		/* aaa = a * aR / R mod M = a * 1 / R mod M */
		mont_mult(key, aaa, a, aR, mu, de, b_modulus);
	} else {
		/* Exponent 65537 */
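		/*
		 * a^65537 = a^(2^16) * a: 16 Montgomery squarings (two per
		 * loop iteration) followed by one multiplication by the
		 * plain (non-Montgomery) a, which also divides out R.
		 */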
		for (i = 0; i < 16; i += 2) {
			/* aaR = aR * aR / R mod M */
			mont_mult(key, aaR, aR, aR, mu, de, b_modulus);
			/* aR = aaR * aaR / R mod M */
			mont_mult(key, aR, aaR, aaR, mu, de, b_modulus);
		}
		/* aaa = aR * a / R mod M */
		mont_mult(key, aaa, aR, a, mu, de, b_modulus);
	}

	/* Convert little endian to big endian. */
	swap_endianness(aaa, (uint32_t *)inout, key->arrsize);

	return VB2_SUCCESS;
}