/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
: Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
      #define USE_HW_SHA
  #elif defined(Z7_LLVM_CLANG_VERSION)  && (Z7_LLVM_CLANG_VERSION  >= 170001) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
     || defined(Z7_GCC_VERSION)         && (Z7_GCC_VERSION         >= 140000)
      #define USE_HW_SHA
      #if !defined(__INTEL_COMPILER)
      // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
      #if !defined(__SHA512__) || !defined(__AVX2__)
        #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
      #endif
      #endif
  #elif defined(Z7_MSC_VER_ORIGINAL)
    #if (_MSC_VER >= 1940)
      #define USE_HW_SHA
    #else
      // #define Z7_USE_HW_SHA_STUB
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
  // #define Z7_USE_HW_SHA_STUB // for debug
#endif
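
// Note: USE_HW_SHA is defined above only for compilers that are known to
// provide the SHA-512 intrinsics and (for GCC/Clang builds) the
// __attribute__((target)) mechanism used by ATTRIB_SHA512. If no hardware
// variant below is compiled, the optional stub at the end of this file can
// forward Sha512_UpdateBlocks_HW() to the portable Sha512_UpdateBlocks().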

#ifdef USE_HW_SHA

// #pragma message("Sha512 HW")

#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__SHA512__)
    #include <sha512intrin.h>
  #endif
#else

#endif
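
// Note (assumption): when clang is used in MSVC-compatible mode, <immintrin.h>
// may not declare the AVX / AVX2 / SHA-512 intrinsics unless the corresponding
// target features are enabled on the command line, so the sub-headers are
// included directly above when those feature macros are not predefined.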

/*
SHA512 uses:
AVX:
  _mm256_loadu_si256  (vmovdqu)
  _mm256_storeu_si256
  _mm256_set_epi32    (unused)
AVX2:
  _mm256_add_epi64     : vpaddq
  _mm256_shuffle_epi8  : vpshufb
  _mm256_shuffle_epi32 : pshufd
  _mm256_blend_epi32   : vpblendd
  _mm256_permute4x64_epi64 : vpermq     : 3c
  _mm256_permute2x128_si256: vperm2i128 : 3c
  _mm256_extracti128_si256 : vextracti128  : 3c
SHA512:
  _mm256_sha512*
*/
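
// Note: the intrinsics listed above require the AVX2 and SHA512 ISA
// extensions. When this translation unit is not compiled with those features
// enabled globally, ATTRIB_SHA512 (defined earlier) enables them per-function
// via __attribute__((__target__("sha512,avx2"))).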

// The K array must be aligned to at least 32 bytes.
// The compiler can use the align attribute to select
//  vmovdqu - for code without the align attribute
//  vmovdqa - for code with    the align attribute
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY


#define ADD_EPI64(dest, src)      dest = _mm256_add_epi64(dest, src);
#define SHA512_MSG1(dest, src)    dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
#define SHA512_MSG2(dest, src)    dest = _mm256_sha512msg2_epi64(dest, src);

#define LOAD_SHUFFLE(m, k) \
    m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
    m = _mm256_shuffle_epi8(m, mask); \

#define NNN(m0, m1, m2, m3)

#define SM1(m1, m2, m3, m0) \
    SHA512_MSG1(m0, m1); \

#define SM2(m2, m3, m0, m1) \
    ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
    SHA512_MSG2(m0, m3); \

#define RND2(t0, t1, lane) \
    t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));



#define R4(k, m0, m1, m2, m3, OP0, OP1) \
    msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1, 0);  OP0(m0, m1, m2, m3) \
    RND2(state1, state0, 1);  OP1(m0, m1, m2, m3) \




#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

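// Round structure note: each R4 adds four round constants from K to four
// message qwords and then performs four rounds via two vsha512rnds2 steps
// (RND2), one per 128-bit half of the prepared msg vector. Each R16 therefore
// covers 16 rounds while rotating the four message vectors m0..m3, and the
// five R16 invocations in the block loop below cover all 80 SHA-512 rounds.
// SM1/SM2 extend the message schedule in-flight; NNN is a no-op placeholder
// used for the first and last rounds, where no further expansion is needed.
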
#define PREPARE_STATE \
    state0 = _mm256_shuffle_epi32(state0, 0x4e);              /* cdab */ \
    state1 = _mm256_shuffle_epi32(state1, 0x4e);              /* ghef */ \
    tmp = state0; \
    state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \
    state1 = _mm256_permute2x128_si256(tmp,    state1, 2);    /* abef */ \

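// PREPARE_STATE reorders the eight 64-bit state words from the natural
// {a,b,c,d},{e,f,g,h} memory layout into the {c,d,g,h},{a,b,e,f} arrangement
// expected by vsha512rnds2. The same macro is applied again after the block
// loop, which restores the original layout before the state is stored back.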

void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  const __m256i mask = _mm256_set_epi32(
      0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
      0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
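  // The mask reverses the bytes of each 64-bit lane in _mm256_shuffle_epi8,
  // converting the big-endian message words of the input block to the host
  // byte order before they enter the rounds.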
  __m256i tmp, state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
  state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m256i state0_save, state1_save;
    __m256i m0, m1, m2, m3;
    __m256i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)



    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
    ADD_EPI64(state0, state0_save)
    ADD_EPI64(state1, state1_save)

    data += 128;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
  _mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
}

#endif // USE_HW_SHA

// gcc 8.5 also supports sha512, but we also need support in the assembler that is called by gcc
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)

  #if defined(__ARM_FEATURE_SHA512)
    #define USE_HW_SHA
  #else
    #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
           || defined(__GNUC__) && (__GNUC__ >= 9) \
          ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
      #define USE_HW_SHA
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha512 HW === ")


#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
#if defined(__clang__)
    #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
#else
    #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
#endif


#if defined(Z7_MSC_VER_ORIGINAL)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #define Z7_ARM_FEATURE_SHA512_WAS_SET 1
    #define __ARM_FEATURE_SHA512 1
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
    defined(__ARM_FEATURE_SHA512)
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #undef __ARM_FEATURE_SHA512
    #undef Z7_ARM_FEATURE_SHA512_WAS_SET
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_SHA512 === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL
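
// Note: the block above works around older toolchains. For clang versions
// before 16, __ARM_FEATURE_SHA512 is temporarily defined so that <arm_neon.h>
// declares the vsha512* intrinsics, and it is undefined again afterwards.
// MSVC uses <arm64_neon.h> instead.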

typedef uint64x2_t v128_64;
// typedef __n128 v128_64; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev64_for_LE(x) x
#else
  #define MY_rev64_for_LE(x) vrev64q_u8(x)
#endif

#define LOAD_128_64(_p)       vld1q_u64(_p)
#define LOAD_128_8(_p)        vld1q_u8 (_p)
#define STORE_128_64(_p, _v)  vst1q_u64(_p, _v)

#define LOAD_SHUFFLE(m, k) \
    m = vreinterpretq_u64_u8( \
        MY_rev64_for_LE( \
        LOAD_128_8(data + (k) * 16))); \

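// LOAD_SHUFFLE loads 16 message bytes and, on little-endian hosts,
// byte-reverses each 64-bit lane with vrev64q_u8 so that the big-endian
// message words of the block are converted to host byte order.
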
// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY

#define NN(m0, m1, m4, m5, m7)
#define SM(m0, m1, m4, m5, m7) \
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));

#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
    OP(m0, m1, m4, m5, m7) \
    t = vaddq_u64(m0, vld1q_u64(k)); \
    t = vaddq_u64(vextq_u64(t, t, 1), a3); \
    t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
    a3 = vsha512h2q_u64(t, a1, a0); \
    a1 = vaddq_u64(a1, t); \

#define R8(k,     m0,m1,m2,m3,m4,m5,m6,m7, OP) \
    R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
    R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
    R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
    R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \

#define R16(k, OP) \
    R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
    R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \

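// Round structure note: each R2 performs two SHA-512 rounds with the
// vsha512hq/vsha512h2q instructions, consuming two round constants loaded
// from the k pointer. The rotation of the a0..a3 / m0..m7 arguments across
// the R2 and R8 invocations implements the rotation of the working variables
// and of the 16-word message window. The function body below executes one R16
// with NN (no schedule update) followed by four R16 with SM, i.e. 5 * 16 = 80
// rounds per block.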

void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  v128_64 a0, a1, a2, a3;

  if (numBlocks == 0)
    return;
  a0 = LOAD_128_64(&state[0]);
  a1 = LOAD_128_64(&state[2]);
  a2 = LOAD_128_64(&state[4]);
  a3 = LOAD_128_64(&state[6]);
  do
  {
    v128_64 a0_save, a1_save, a2_save, a3_save;
    v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
    v128_64 t;
    unsigned i;
    const UInt64 *k_ptr;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)
    LOAD_SHUFFLE (m4, 4)
    LOAD_SHUFFLE (m5, 5)
    LOAD_SHUFFLE (m6, 6)
    LOAD_SHUFFLE (m7, 7)

    a0_save = a0;
    a1_save = a1;
    a2_save = a2;
    a3_save = a3;

    R16 ( K, NN )
    k_ptr = K + 16;
    for (i = 0; i < 4; i++)
    {
      R16 ( k_ptr, SM )
      k_ptr += 16;
    }

    a0 = vaddq_u64(a0, a0_save);
    a1 = vaddq_u64(a1, a1_save);
    a2 = vaddq_u64(a2, a2_save);
    a3 = vaddq_u64(a3, a3_save);

    data += 128;
  }
  while (--numBlocks);

  STORE_128_64(&state[0], a0);
  STORE_128_64(&state[2], a1);
  STORE_128_64(&state[4], a2);
  STORE_128_64(&state[6], a3);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile an asm version,
// so real code can be generated instead of this stub function.
// #include "Sha512.h"
// #if defined(_MSC_VER)
#pragma message("Sha512 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha512_UpdateBlocks   (UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  Sha512_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif
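
/*
Usage sketch (illustrative only, not part of this file): a caller is expected
to check CPU support at run time and route blocks either to the hardware
routine above or to the portable Sha512_UpdateBlocks(). The feature-check
name below is a hypothetical placeholder, not the real detection function:

  if (CPU_Has_SHA512_Extension())      // hypothetical run-time feature check
    Sha512_UpdateBlocks_HW(state, data, numBlocks);
  else
    Sha512_UpdateBlocks(state, data, numBlocks);
*/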


#undef K
#undef RND2
#undef MY_rev64_for_LE
#undef NN
#undef NNN
#undef LOAD_128_64
#undef LOAD_128_8
#undef STORE_128_64
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM
#undef R2
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA512
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB
396