/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
: Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
      #define USE_HW_SHA
  #elif defined(Z7_LLVM_CLANG_VERSION)  && (Z7_LLVM_CLANG_VERSION  >= 30800) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
     || defined(Z7_GCC_VERSION)         && (Z7_GCC_VERSION         >= 40900)
      #define USE_HW_SHA
      #if !defined(__INTEL_COMPILER)
      // icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
      #if !defined(__SHA__) || !defined(__SSSE3__)
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
      #endif
      #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1900)
      #define USE_HW_SHA
    #else
      #define Z7_USE_HW_SHA_STUB
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
  // #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha1 HW")


// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  #if !defined(__SHA__)
    #include <shaintrin.h>
  #endif
#endif

/*
SHA1 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd
  _mm_xor_si128
  _mm_cvtsi128_si32
  _mm_cvtsi32_si128
SSSE3:
  _mm_shuffle_epi8 / pshufb

SHA:
  _mm_sha1*
*/
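
/*
Added note (commentary only): the "mask" constant defined in
Sha1_UpdateBlocks_HW below,
  _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f),
makes _mm_shuffle_epi8 reverse all 16 bytes of each loaded block quarter.
One shuffle therefore does two jobs: it byte-swaps every 32-bit word into
the big-endian order SHA-1 requires, and it reverses the word order within
the vector to match the lane layout consumed by the _mm_sha1* intrinsics.
For input bytes b0..b15, lane 0 receives bytes b12..b15 (b12 most
significant) and lane 3 receives bytes b0..b3.
*/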

#define XOR_SI128(dest, src)      dest = _mm_xor_si128(dest, src);
#define SHUFFLE_EPI8(dest, mask)  dest = _mm_shuffle_epi8(dest, mask);
#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
#ifdef __clang__
#define SHA1_RNDS4_RET_TYPE_CAST (__m128i)
#else
#define SHA1_RNDS4_RET_TYPE_CAST
#endif
#define SHA1_RND4(abcd, e0, f)    abcd = SHA1_RNDS4_RET_TYPE_CAST _mm_sha1rnds4_epu32(abcd, e0, f);
#define SHA1_NEXTE(e, m)          e = _mm_sha1nexte_epu32(e, m);
#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA1_MSG1(dest, src)      dest = _mm_sha1msg1_epu32(dest, src);
#define SHA1_MSG2(dest, src)      dest = _mm_sha1msg2_epu32(dest, src);

#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    SHUFFLE_EPI8(m, mask) \

#define NNN(m0, m1, m2, m3)

#define SM1(m0, m1, m2, m3) \
    SHA1_MSG1(m0, m1) \

#define SM2(m0, m1, m2, m3) \
    XOR_SI128(m3, m1) \
    SHA1_MSG2(m3, m2) \

#define SM3(m0, m1, m2, m3) \
    XOR_SI128(m3, m1) \
    SM1(m0, m1, m2, m3) \
    SHA1_MSG2(m3, m2) \

#define R4(k, m0, m1, m2, m3, e0, e1, OP) \
    e1 = abcd; \
    SHA1_RND4(abcd, e0, (k) / 5) \
    SHA1_NEXTE(e1, m1) \
    OP(m0, m1, m2, m3) \


#define R16(k, mx, OP0, OP1, OP2, OP3) \
    R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \
    R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \

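/*
Added note (commentary only): _mm_sha1rnds4_epu32 performs four SHA-1
rounds; its immediate operand (0..3) selects the round function and
constant group (Ch/K0, Parity/K1, Maj/K2, Parity/K3). Across the five
R16() calls below, R4() receives k = 0..19, so (k)/5 yields 0..3:
20 rounds per group, 80 rounds in total. R16() rotates the roles of
m0..m3 each step, so four 128-bit registers implement the usual
16-word sliding window of the message schedule.
*/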
#define PREPARE_STATE \
    SHUFFLE_EPI32 (abcd, 0x1B) \
    SHUFFLE_EPI32 (e0,   0x1B) \

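/*
Added note (commentary only): 0x1B selects lanes 3,2,1,0, so
_mm_shuffle_epi32(x, 0x1B) reverses the four 32-bit lanes.
PREPARE_STATE thus converts between the a,b,c,d order of state[] and
the reversed lane order used by the SHA instructions; it is applied
once before the block loop and once after it to convert back.
*/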


void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);

  __m128i abcd, e0;

  if (numBlocks == 0)
    return;

  abcd = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); // dbca
  e0 = _mm_cvtsi32_si128((int)state[4]); // 000e

  PREPARE_STATE

  do
  {
    __m128i abcd_save, e2;
    __m128i m0, m1, m2, m3;
    __m128i e1;

    abcd_save = abcd;
    e2 = e0;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    ADD_EPI32(e0, m0)

    R16 ( 0, m0, SM1, SM3, SM3, SM3 )
    R16 ( 1, m0, SM3, SM3, SM3, SM3 )
    R16 ( 2, m0, SM3, SM3, SM3, SM3 )
    R16 ( 3, m0, SM3, SM3, SM3, SM3 )
    R16 ( 4, e2, SM2, NNN, NNN, NNN )

    ADD_EPI32(abcd, abcd_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) state, abcd);
  *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0);
}
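
/*
Usage sketch (illustrative, not part of this file): callers are expected
to select the HW routine at run time, as Sha1.c does. This sketch assumes
the SHA1_FUNC_UPDATE_BLOCKS pointer type from Sha1.h and the
CPU_IsSupported_SHA() check from CpuArch.h (x86; ARM builds use a
corresponding ARM feature check):

  SHA1_FUNC_UPDATE_BLOCKS func = Sha1_UpdateBlocks; // portable fallback
  if (CPU_IsSupported_SHA())
    func = Sha1_UpdateBlocks_HW;                    // this file
  func(state, data, numBlocks);                     // whole 64-byte blocks
*/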

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
   && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
  #if   defined(__ARM_FEATURE_SHA2) \
     || defined(__ARM_FEATURE_CRYPTO)
    #define USE_HW_SHA
  #else
    #if  defined(MY_CPU_ARM64) \
      || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
      || defined(Z7_MSC_VER_ORIGINAL)
    #if  defined(__ARM_FP) && \
          (   defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
           || defined(__GNUC__) && (__GNUC__ >= 6) \
          ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
    #if  defined(MY_CPU_ARM64) \
      || !defined(Z7_CLANG_VERSION) \
      || defined(__ARM_NEON) && \
          (Z7_CLANG_VERSION < 170000 || \
           Z7_CLANG_VERSION > 170001)
      #define USE_HW_SHA
    #endif
    #endif
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha1 HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
  #ifdef MY_CPU_ARM64
#if defined(__clang__)
    #define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
  #else
#if defined(__clang__) && (__clang_major__ >= 1)
    #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
  #endif
#endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
//     #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
    #define __ARM_FEATURE_CRYPTO 1
// #else
    #define __ARM_FEATURE_SHA2 1
// #endif
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
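
/*
Added explanation: older clang (here, major version < 16) only declares
the vsha1* intrinsics in <arm_neon.h> when __ARM_FEATURE_CRYPTO or
__ARM_FEATURE_SHA2 is defined, even though code generation is enabled
per-function via ATTRIB_SHA. The block above temporarily defines those
macros so the declarations become visible; the block after the #include
below undoes this (see Z7_ARM_FEATURE_CRYPTO_WAS_SET).
*/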

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
//    #pragma message("#define __ARM_ARCH 8")
    #undef  __ARM_ARCH
    #define __ARM_ARCH 8
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #undef __ARM_FEATURE_CRYPTO
    #undef __ARM_FEATURE_SHA2
    #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
//    #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC
// the bug in clang 3.8.1:
// __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif

#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x) x
#else
  #define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif

#define LOAD_128_32(_p)       vld1q_u32(_p)
#define LOAD_128_8(_p)        vld1q_u8 (_p)
#define STORE_128_32(_p, _v)  vst1q_u32(_p, _v)

#define LOAD_SHUFFLE(m, k) \
    m = vreinterpretq_u32_u8( \
        MY_rev32_for_LE( \
        LOAD_128_8(data + (k) * 16))); \

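/*
Added note (commentary only): vrev32q_u8 reverses the bytes within each
32-bit element. On little-endian targets LOAD_SHUFFLE therefore delivers
the big-endian message words SHA-1 requires; on big-endian targets the
plain load already has that layout, so MY_rev32_for_LE is the identity.
*/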
#define N0(dest, src2, src3)
#define N1(dest, src)
#define U0(dest, src2, src3)  dest = vsha1su0q_u32(dest, src2, src3);
#define U1(dest, src)         dest = vsha1su1q_u32(dest, src);
#define C(e)                  abcd = vsha1cq_u32(abcd, e, t)
#define P(e)                  abcd = vsha1pq_u32(abcd, e, t)
#define M(e)                  abcd = vsha1mq_u32(abcd, e, t)
#define H(e)                  e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
#define T(m, c)               t = vaddq_u32(m, c)

#define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \
    T(m0, d0);  f0(m3, m0, m1)  z0(m2, m1)  H(e1);  w0(e0); \
    T(m1, d1);  f1(m0, m1, m2)  z1(m3, m2)  H(e0);  w1(e1); \
    T(m2, d2);  f2(m1, m2, m3)  z2(m0, m3)  H(e1);  w2(e0); \
    T(m3, d3);  f3(m2, m3, m0)  z3(m1, m0)  H(e0);  w3(e1); \

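/*
Added note (commentary only): C/P/M wrap vsha1cq_u32 / vsha1pq_u32 /
vsha1mq_u32, which each perform four rounds with the Choose, Parity and
Majority functions. H wraps vsha1h_u32, the rotate-left-by-30 of lane 0
of abcd (the value that becomes the next E). T adds a round-constant
vector to a message vector, U0/U1 (vsha1su0q_u32 / vsha1su1q_u32) are the
two-step message-schedule update, and N0/N1 are no-op placeholders used
where no schedule update is needed at the start and end of the block.
*/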

void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  v128 abcd;
  v128 c0, c1, c2, c3;
  uint32_t e0;

  if (numBlocks == 0)
    return;

  c0 = vdupq_n_u32(0x5a827999);
  c1 = vdupq_n_u32(0x6ed9eba1);
  c2 = vdupq_n_u32(0x8f1bbcdc);
  c3 = vdupq_n_u32(0xca62c1d6);

  abcd = LOAD_128_32(&state[0]);
  e0 = state[4];

  do
  {
    v128 abcd_save;
    v128 m0, m1, m2, m3;
    v128 t;
    uint32_t e0_save, e1;

    abcd_save = abcd;
    e0_save = e0;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C )
    R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P )
    R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M )
    R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P )
    R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P )

    abcd = vaddq_u32(abcd, abcd_save);
    e0 += e0_save;

    data += 64;
  }
  while (--numBlocks);

  STORE_128_32(&state[0], abcd);
  state[4] = e0;
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64

#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// #include "Sha1.h"
// #if defined(_MSC_VER)
#pragma message("Sha1   HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha1_UpdateBlocks   (UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  Sha1_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif

#undef U0
#undef U1
#undef N0
#undef N1
#undef C
#undef P
#undef M
#undef H
#undef T
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128_32
#undef LOAD_128_8
#undef STORE_128_32
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM3
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB
425