1 /* Blake2s.c -- BLAKE2sp Hash
2 2024-05-18 : Igor Pavlov : Public domain
3 2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
4 
5 #include "Precomp.h"
6 
7 // #include <stdio.h>
8 #include <string.h>
9 
10 #include "Blake2.h"
11 #include "RotateDefs.h"
12 #include "Compiler.h"
13 #include "CpuArch.h"
14 
15 /*
16   if defined(__AVX512F__) && defined(__AVX512VL__)
17   {
18     we define Z7_BLAKE2S_USE_AVX512_ALWAYS,
19     but the compiler can use avx512 for any code.
20   }
21   else if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
22     { we use avx512 only for sse* and avx* branches of code. }
23 */
24 // #define Z7_BLAKE2S_USE_AVX512_ALWAYS // for debug
25 
26 #if defined(__SSE2__)
27     #define Z7_BLAKE2S_USE_VECTORS
28 #elif defined(MY_CPU_X86_OR_AMD64)
29   #if  defined(_MSC_VER) && _MSC_VER > 1200 \
30     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
31     || defined(__clang__) \
32     || defined(__INTEL_COMPILER)
33     #define Z7_BLAKE2S_USE_VECTORS
34   #endif
35 #endif
36 
37 #ifdef Z7_BLAKE2S_USE_VECTORS
38 
39 #define Z7_BLAKE2SP_USE_FUNCTIONS
40 
41 //  define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned for 32 bytes.
42 // #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
43 
44 // SSSE3 : for _mm_shuffle_epi8 (pshufb), which improves performance by 5-15%.
45 #if defined(__SSSE3__)
46   #define Z7_BLAKE2S_USE_SSSE3
47 #elif  defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
48     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
49     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
50     || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
51     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
52   #define Z7_BLAKE2S_USE_SSSE3
53 #endif
54 
55 #ifdef Z7_BLAKE2S_USE_SSSE3
56 /* SSE41 : for _mm_insert_epi32 (pinsrd)
57   it can slightly reduce code size and improve performance in some cases.
58     it's used only for the last 512-1024 bytes, if FAST versions (2 or 3) of the vector algos are used.
59     it can be used for all blocks in the other algos (4+).
60 */
61 #if defined(__SSE4_1__)
62   #define Z7_BLAKE2S_USE_SSE41
63 #elif  defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
64     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
65     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
66     || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
67     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
68   #define Z7_BLAKE2S_USE_SSE41
69 #endif
70 #endif // SSSE3
71 
72 #if defined(__GNUC__) || defined(__clang__)
73 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) && !(defined(__AVX512F__) && defined(__AVX512VL__))
74     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("avx512vl,avx512f")))
75 #else
76   #if defined(Z7_BLAKE2S_USE_SSE41)
77     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse4.1")))
78   #elif defined(Z7_BLAKE2S_USE_SSSE3)
79     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("ssse3")))
80   #else
81     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse2")))
82   #endif
83 #endif
84 #endif
85 
86 
87 #if defined(__AVX2__)
88   #define Z7_BLAKE2S_USE_AVX2
89 #else
90   #if    defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
91       || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
92       || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
93     #define Z7_BLAKE2S_USE_AVX2
94     #ifdef Z7_BLAKE2S_USE_AVX2
95 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) && !(defined(__AVX512F__) && defined(__AVX512VL__))
96       #define BLAKE2S_ATTRIB_AVX2  __attribute__((__target__("avx512vl,avx512f")))
97 #else
98       #define BLAKE2S_ATTRIB_AVX2  __attribute__((__target__("avx2")))
99 #endif
100     #endif
101   #elif  defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
102       || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
103     #if (Z7_MSC_VER_ORIGINAL == 1900)
104       #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
105     #endif
106     #define Z7_BLAKE2S_USE_AVX2
107   #endif
108 #endif
109 
110 #ifdef Z7_BLAKE2S_USE_SSE41
111 #include <smmintrin.h> // SSE4.1
112 #elif defined(Z7_BLAKE2S_USE_SSSE3)
113 #include <tmmintrin.h> // SSSE3
114 #else
115 #include <emmintrin.h> // SSE2
116 #endif
117 
118 #ifdef Z7_BLAKE2S_USE_AVX2
119 #include <immintrin.h>
120 #if defined(__clang__)
121 #include <avxintrin.h>
122 #include <avx2intrin.h>
123 #endif
124 #endif // avx2
125 
126 
127 #if defined(__AVX512F__) && defined(__AVX512VL__)
128    // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
129   #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
130   #define Z7_BLAKE2S_USE_AVX512_ALWAYS
131   #endif
132   // #pragma message ("=== Blake2s AVX512")
133 #endif
134 
135 
136 #define Z7_BLAKE2S_USE_V128_FAST
137 // for speed optimization for small messages:
138 // #define Z7_BLAKE2S_USE_V128_WAY2
139 
140 #ifdef Z7_BLAKE2S_USE_AVX2
141 
142 // for debug:
143 // gather is slow
144 // #define Z7_BLAKE2S_USE_GATHER
145 
146   #define   Z7_BLAKE2S_USE_AVX2_FAST
147 // for speed optimization for small messages:
148 //   #define   Z7_BLAKE2S_USE_AVX2_WAY2
149 //   #define   Z7_BLAKE2S_USE_AVX2_WAY4
150 #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
151     defined(Z7_BLAKE2S_USE_AVX2_WAY4)
152   #define   Z7_BLAKE2S_USE_AVX2_WAY_SLOW
153 #endif
154 #endif
155 
156   #define Z7_BLAKE2SP_ALGO_DEFAULT    0
157   #define Z7_BLAKE2SP_ALGO_SCALAR     1
158 #ifdef Z7_BLAKE2S_USE_V128_FAST
159   #define Z7_BLAKE2SP_ALGO_V128_FAST  2
160 #endif
161 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
162   #define Z7_BLAKE2SP_ALGO_V256_FAST  3
163 #endif
164   #define Z7_BLAKE2SP_ALGO_V128_WAY1  4
165 #ifdef Z7_BLAKE2S_USE_V128_WAY2
166   #define Z7_BLAKE2SP_ALGO_V128_WAY2  5
167 #endif
168 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
169   #define Z7_BLAKE2SP_ALGO_V256_WAY2  6
170 #endif
171 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
172   #define Z7_BLAKE2SP_ALGO_V256_WAY4  7
173 #endif
174 
175 #endif // Z7_BLAKE2S_USE_VECTORS
176 
177 
178 
179 
180 #define BLAKE2S_FINAL_FLAG  (~(UInt32)0)
181 #define NSW                 Z7_BLAKE2SP_NUM_STRUCT_WORDS
182 #define SUPER_BLOCK_SIZE    (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
183 #define SUPER_BLOCK_MASK    (SUPER_BLOCK_SIZE - 1)
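/*
  BLAKE2sp hashes the input with Z7_BLAKE2SP_PARALLEL_DEGREE (8 for BLAKE2sp) independent
  BLAKE2s lanes, fed round-robin in 64-byte blocks. So one "super block" of
  SUPER_BLOCK_SIZE = 8 * 64 = 512 bytes contains exactly one block for each lane,
  and the compress loops below keep a byte position (pos), masked with
  SUPER_BLOCK_MASK, to locate the state of the lane that owns the current block.
*/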
184 
185 #define V_INDEX_0_0   0
186 #define V_INDEX_1_0   1
187 #define V_INDEX_2_0   2
188 #define V_INDEX_3_0   3
189 #define V_INDEX_0_1   4
190 #define V_INDEX_1_1   5
191 #define V_INDEX_2_1   6
192 #define V_INDEX_3_1   7
193 #define V_INDEX_0_2   8
194 #define V_INDEX_1_2   9
195 #define V_INDEX_2_2  10
196 #define V_INDEX_3_2  11
197 #define V_INDEX_0_3  12
198 #define V_INDEX_1_3  13
199 #define V_INDEX_2_3  14
200 #define V_INDEX_3_3  15
201 #define V_INDEX_4_0   0
202 #define V_INDEX_5_0   1
203 #define V_INDEX_6_0   2
204 #define V_INDEX_7_0   3
205 #define V_INDEX_7_1   4
206 #define V_INDEX_4_1   5
207 #define V_INDEX_5_1   6
208 #define V_INDEX_6_1   7
209 #define V_INDEX_6_2   8
210 #define V_INDEX_7_2   9
211 #define V_INDEX_4_2  10
212 #define V_INDEX_5_2  11
213 #define V_INDEX_5_3  12
214 #define V_INDEX_6_3  13
215 #define V_INDEX_7_3  14
216 #define V_INDEX_4_3  15
217 
218 #define V(row, col)  v[V_INDEX_ ## row ## _ ## col]
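/*
  In the SIMD "FAST" code below, the 16 words of the BLAKE2s working state live in
  16 vectors v[0..15], one vector per word position (each SIMD lane belongs to a
  different parallel BLAKE2s instance). V(row, col) selects the col-th argument
  (a, b, c, d) of the row-th G call of a round:
    rows 0..3 : column   step, e.g. row 0 is G(v[0], v[4], v[ 8], v[12])
    rows 4..7 : diagonal step, e.g. row 4 is G(v[0], v[5], v[10], v[15])
*/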
219 
220 #define k_Blake2s_IV_0  0x6A09E667UL
221 #define k_Blake2s_IV_1  0xBB67AE85UL
222 #define k_Blake2s_IV_2  0x3C6EF372UL
223 #define k_Blake2s_IV_3  0xA54FF53AUL
224 #define k_Blake2s_IV_4  0x510E527FUL
225 #define k_Blake2s_IV_5  0x9B05688CUL
226 #define k_Blake2s_IV_6  0x1F83D9ABUL
227 #define k_Blake2s_IV_7  0x5BE0CD19UL
228 
229 #define KIV(n)  (k_Blake2s_IV_## n)
230 
231 #ifdef Z7_BLAKE2S_USE_VECTORS
232 MY_ALIGN(16)
233 static const UInt32 k_Blake2s_IV[8] =
234 {
235   KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
236 };
237 #endif
238 
239 #define STATE_T(s)        ((s) + 8)
240 #define STATE_F(s)        ((s) + 10)
241 
242 #ifdef Z7_BLAKE2S_USE_VECTORS
243 
244 #define LOAD_128(p)    _mm_load_si128 ((const __m128i *)(const void *)(p))
245 #define LOADU_128(p)   _mm_loadu_si128((const __m128i *)(const void *)(p))
246 #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
247   // here we use unaligned loads and stores
248   // use this branch if CBlake2sp can be unaligned for 16 bytes
249   #define STOREU_128(p, r)  _mm_storeu_si128((__m128i *)(void *)(p), r)
250   #define LOAD_128_FROM_STRUCT(p)     LOADU_128(p)
251   #define STORE_128_TO_STRUCT(p, r)   STOREU_128(p, r)
252 #else
253   // here we use aligned loads and stores
254   // use this branch if CBlake2sp is aligned for 16 bytes
255   #define STORE_128(p, r)  _mm_store_si128((__m128i *)(void *)(p), r)
256   #define LOAD_128_FROM_STRUCT(p)     LOAD_128(p)
257   #define STORE_128_TO_STRUCT(p, r)   STORE_128(p, r)
258 #endif
259 
260 #endif // Z7_BLAKE2S_USE_VECTORS
261 
262 
263 #if 0
264 static void PrintState(const UInt32 *s, unsigned num)
265 {
266   unsigned i;
267   printf("\n");
268   for (i = 0; i < num; i++)
269     printf(" %08x", (unsigned)s[i]);
270 }
271 static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
272 {
273   unsigned i;
274   for (i = 0; i < y; i++)
275     PrintState(s + i * x, x);
276   printf("\n");
277 }
278 #endif
279 
280 
281 #define REP8_MACRO(m)  { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
282 
283 #define BLAKE2S_NUM_ROUNDS  10
284 
285 #if defined(Z7_BLAKE2S_USE_VECTORS)
286 #define ROUNDS_LOOP(mac) \
287   { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
288 #endif
289 /*
290 #define ROUNDS_LOOP_2(mac) \
291   { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
292 */
293 #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
294 #define ROUNDS_LOOP_UNROLLED(m) \
295   { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
296 #endif
297 
298 #define SIGMA_TABLE(M) \
299   M(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 ), \
300   M( 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 ), \
301   M( 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 ), \
302   M(  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 ), \
303   M(  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 ), \
304   M(  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 ), \
305   M( 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 ), \
306   M( 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 ), \
307   M(  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 ), \
308   M( 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 )
309 
310 #define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
311   { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
312 #define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
313         SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
314 
315 // MY_ALIGN(32)
316 MY_ALIGN(16)
317 static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
318   { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
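/*
  The sigma entries above are pre-multiplied by sizeof(UInt32), so each entry is
  directly the byte offset of the selected 32-bit message word:
    GET_SIGMA_PTR(input, sigma[k]) == input + 4 * SIGMA[r][k]
  without an extra shift at run time. k_Blake2s_Sigma_16 and k_Blake2s_Sigma_32
  below apply the same idea with strides 16 and 32 for the transposed 4-way and
  8-way message layouts.
*/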
319 
320 #define GET_SIGMA_PTR(p, index) \
321     ((const void *)((const Byte *)(const void *)(p) + (index)))
322 
323 #define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
324     ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
325 
326 
327 #ifdef Z7_BLAKE2S_USE_VECTORS
328 
329 
330 #if 0
331   // use loading constants from memory
332   // is faster for some compilers.
333   #define KK4(n)  KIV(n), KIV(n), KIV(n), KIV(n)
334 MY_ALIGN(64)
335 static const UInt32 k_Blake2s_IV_WAY4[]=
336 {
337   KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
338 };
339   #define GET_128_IV_WAY4(i)  LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
340 #else
341   // use constant generation:
342   #define GET_128_IV_WAY4(i)  _mm_set1_epi32((Int32)KIV(i))
343 #endif
344 
345 
346 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
347 #define GET_CONST_128_FROM_ARRAY32(k) \
348     _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
349 #endif
350 
351 
352 #if 0
353 #define k_r8    _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
354 #define k_r16   _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
355 #define k_inc   _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
356 #define k_iv0_128  GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
357 #define k_iv4_128  GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
358 #else
359 #if  defined(Z7_BLAKE2S_USE_SSSE3) && \
360     !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
361 MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
362 MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
363 #define k_r8    LOAD_128(k_r8_arr)
364 #define k_r16   LOAD_128(k_r16_arr)
365 #endif
366 MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
367 #define k_inc   LOAD_128(k_inc_arr)
368 #define k_iv0_128  LOAD_128(k_Blake2s_IV + 0)
369 #define k_iv4_128  LOAD_128(k_Blake2s_IV + 4)
370 #endif
371 
372 
373 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
374 
375 #ifdef Z7_BLAKE2S_USE_AVX2
376 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
377   #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
378 #else
379   #define MY_mm256_set_m128i  _mm256_set_m128i
380 #endif
381 
382 #define SET_FROM_128(a)  MY_mm256_set_m128i(a, a)
383 
384 #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
385 MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
386 {
387   1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
388   1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
389 };
390 MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
391 {
392   2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
393   2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
394 };
395 #define k_r8_256    LOAD_256(k_r8_arr_256)
396 #define k_r16_256   LOAD_256(k_r16_arr_256)
397 #endif
398 
399 // #define k_r8_256    SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
400 // #define k_r16_256   SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
401 // #define k_inc_256   SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
402 // #define k_iv0_256   SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
403 #define k_iv4_256   SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
404 #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
405 #endif
406 
407 
408 /*
409 IPC(TP) ports:
410 1 p__5  : skl-      : SSE   : shufps  : _mm_shuffle_ps
411 2 p_15  : icl+
412 1 p__5  : nhm-bdw   : SSE   : xorps   : _mm_xor_ps
413 3 p015  : skl+
414 
415 3 p015              : SSE2  : pxor    : _mm_xor_si128
416 2 p_15:   snb-bdw   : SSE2  : padd    : _mm_add_epi32
417 2 p0_5:   mrm-wsm   :
418 3 p015  : skl+
419 
420 2 p_15  : ivb-,icl+ : SSE2  : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
421 2 p_15  :           : SSE2  : pshufd  : _mm_shuffle_epi32
422 2 p_15  :           : SSE2  : pshuflw : _mm_shufflelo_epi16
423 2 p_15  :           : SSE2  : psrldq  :
424 2 p_15  :           : SSE3  : pshufb  : _mm_shuffle_epi8
425 2 p_15  :           : SSE4  : pblendw : _mm_blend_epi16
426 1 p__5  : hsw-skl   : *
427 
428 1 p0                : SSE2  : pslld (i8) : _mm_slli_si128
429 2 p01   : skl+      :
430 
431 2 p_15  : ivb-      : SSE3  : palignr
432 1 p__5  : hsw+
433 
434 2 p_15 + p23 : ivb-, icl+ : SSE4   : pinsrd  : _mm_insert_epi32(xmm, m32, i8)
435 1 p__5 + p23 : hsw-skl
436 1 p_15 + p5  : ivb-, ice+ : SSE4   : pinsrd  : _mm_insert_epi32(xmm, r32, i8)
437 0.5    2*p5  : hsw-skl
438 
439 2 p23               : SSE2   : movd (m32)
440 3 p23A  : adl       :
441 1 p5:               : SSE2   : movd (r32)
442 */
443 
444 #if 0 && defined(__XOP__)
445 // we must debug and test the __XOP__ instructions
446 #include <x86intrin.h>
447 #include <ammintrin.h>
448     #define LOAD_ROTATE_CONSTS
449     #define MM_ROR_EPI32(r, c)  _mm_roti_epi32(r, -(c))
450     #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
451 #elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
452     #define LOAD_ROTATE_CONSTS
453     #define MM_ROR_EPI32(r, c)  _mm_ror_epi32(r, c)
454     #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
455 #else
456 
457 // MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
458 // But "orps" has low throughput: TP=1 for bdw-nhm.
459 // So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
460 // But "orps" is fast for modern cpus (skl+).
461 // So we default to the "or" version:
462 #if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
463   // minor optimization for some old cpus, if "orps" is slow.
464   #define MM128_EPI32_OR_or_ADD  _mm_add_epi32
465 #else
466   #define MM128_EPI32_OR_or_ADD  _mm_or_si128
467 #endif
468 
469   #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
470     MM128_EPI32_OR_or_ADD( \
471       _mm_srli_epi32((r), (c)), \
472       _mm_slli_epi32((r), 32-(c))))
473   #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
474     #define LOAD_ROTATE_CONSTS \
475       const __m128i  r8 = k_r8; \
476       const __m128i r16 = k_r16;
477     #define MM_ROR_EPI32(r, c) ( \
478       ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
479     : (16==(c)) ? _mm_shuffle_epi8(r,r16) \
480     : MM_ROR_EPI32_VIA_SHIFT(r, c))
481   #else
482     #define LOAD_ROTATE_CONSTS
483     #define  MM_ROR_EPI32(r, c) ( \
484       (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
485     : MM_ROR_EPI32_VIA_SHIFT(r, c))
486   #endif
487 #endif
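#if 0
/* Illustrative sketch only (not used by the code above): each MM_ROR_EPI32 variant
   computes, independently in every 32-bit lane, the plain rotate-right that a scalar
   BLAKE2s implementation would write as follows (c is 7, 8, 12 or 16): */
static UInt32 RotrScalar_Sketch(UInt32 x, unsigned c)
{
  return (x >> c) | (x << (32 - c));
}
#endif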
488 
489 /*
490 we have 3 main ways to load 4 32-bit integers into an __m128i:
491   1) SSE2:  _mm_set_epi32()
492   2) SSE2:  _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
493   3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
494 a good compiler generates these instructions for _mm_set_epi32():
495 {
496   movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
497 }
498 a good modern compiler generates one instruction:
499 {
500   for _mm_insert_epi32()  : { pinsrd xmm, [m32], i }
501   for _mm_cvtsi32_si128() : { movd xmm, [m32] }
502 }
503 but vc2010 generates a slow pair of instructions:
504 {
505   for _mm_insert_epi32()  : { mov r32, [m32];  pinsrd xmm, r32, i  }
506   for _mm_cvtsi32_si128() : { mov r32, [m32];  movd  xmm, r32 }
507 }
508 _mm_insert_epi32() (pinsrd) code reduces xmm register pressure
509 in comparison with _mm_set_epi32() (movd + vpunpckld) code.
510 Note that the variant with "movd xmm, r32" can be slower,
511 but register pressure can be more important.
512 So we can force "pinsrd" always.
513 */
514 // #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
515 #ifdef Z7_BLAKE2S_USE_SSE41
516   /* _mm_set_epi32() can be more effective for GCC and CLANG;
517      _mm_insert_epi32() is more effective for MSVC */
518   #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
519     #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
520   #endif
521 #endif // USE_SSE41
522 // #endif
523 
524 #ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
525   // for SSE4.1
526 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3)  \
527     _mm_insert_epi32( \
528     _mm_insert_epi32( \
529     _mm_insert_epi32( \
530     _mm_cvtsi32_si128( \
531         *(const Int32 *)p0), \
532         *(const Int32 *)p1, 1), \
533         *(const Int32 *)p2, 2), \
534         *(const Int32 *)p3, 3)
535 #elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
536 /* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
537    Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
538    But _mm_set_epi32() is more effective for GCC and CLANG.
539    So we use _mm_unpacklo_epi32 for MSVC only */
540 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3)  \
541     _mm_unpacklo_epi64(  \
542         _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0),  \
543                             _mm_cvtsi32_si128(*(const Int32 *)p1)), \
544         _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2),  \
545                             _mm_cvtsi32_si128(*(const Int32 *)p3)))
546 #else
547 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3)  \
548     _mm_set_epi32( \
549         *(const Int32 *)p3, \
550         *(const Int32 *)p2, \
551         *(const Int32 *)p1, \
552         *(const Int32 *)p0)
553 #endif
554 
555 #define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)  \
556       MM_LOAD_EPI32_FROM_4_POINTERS( \
557         GET_SIGMA_PTR(input, i0), \
558         GET_SIGMA_PTR(input, i1), \
559         GET_SIGMA_PTR(input, i2), \
560         GET_SIGMA_PTR(input, i3))
561 
562 #define SET_ROW_FROM_SIGMA(input, sigma_index)  \
563         SET_ROW_FROM_SIGMA_BASE(input, \
564             sigma[(sigma_index)        ], \
565             sigma[(sigma_index) + 2 * 1], \
566             sigma[(sigma_index) + 2 * 2], \
567             sigma[(sigma_index) + 2 * 3]) \
568 
569 
570 #define ADD_128(a, b)   _mm_add_epi32(a, b)
571 #define XOR_128(a, b)   _mm_xor_si128(a, b)
572 
573 #define D_ADD_128(dest, src)        dest = ADD_128(dest, src)
574 #define D_XOR_128(dest, src)        dest = XOR_128(dest, src)
575 #define D_ROR_128(dest, shift)      dest = MM_ROR_EPI32(dest, shift)
576 #define D_ADD_EPI64_128(dest, src)  dest = _mm_add_epi64(dest, src)
577 
578 
579 #define AXR(a, b, d, shift) \
580     D_ADD_128(a, b); \
581     D_XOR_128(d, a); \
582     D_ROR_128(d, shift);
583 
584 #define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
585     a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
586     AXR(a, b, d, shift1) \
587     AXR(c, d, b, shift2)
588 
589 #define ROTATE_WORDS_TO_RIGHT(a, n) \
590     a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
591 
592 #define AXR4(a, b, c, d, input, sigma_index)  \
593     AXR2(a, b, c, d, input, sigma_index,     16, 12) \
594     AXR2(a, b, c, d, input, sigma_index + 1,  8,  7) \
595 
596 #define RR2(a, b, c, d, input) \
597   { \
598     AXR4(a, b, c, d, input, 0) \
599       ROTATE_WORDS_TO_RIGHT(b, 1) \
600       ROTATE_WORDS_TO_RIGHT(c, 2) \
601       ROTATE_WORDS_TO_RIGHT(d, 3) \
602     AXR4(a, b, c, d, input, 8) \
603       ROTATE_WORDS_TO_RIGHT(b, 3) \
604       ROTATE_WORDS_TO_RIGHT(c, 2) \
605       ROTATE_WORDS_TO_RIGHT(d, 1) \
606   }
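#if 0
/* Illustrative sketch only (not compiled): AXR2/RR2 above process the four columns of
   one BLAKE2s state at once (and, after ROTATE_WORDS_TO_RIGHT, the four diagonals).
   Per column/diagonal (a, b, c, d) with message words x = m[sigma[2i]], y = m[sigma[2i+1]],
   the scalar G function is (rotr = 32-bit rotate right, as sketched near MM_ROR_EPI32): */
#define G_SCALAR_SKETCH(a, b, c, d, x, y) \
  { a += b + (x);  d = rotr(d ^ a, 16); \
    c += d;        b = rotr(b ^ c, 12); \
    a += b + (y);  d = rotr(d ^ a,  8); \
    c += d;        b = rotr(b ^ c,  7); }
#endif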
607 
608 
609 /*
610 Way1:
611 per 64-byte block:
612 10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
613                     * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
614 additional operations per 7_op_iter :
615 4 movzx   byte mem
616 1 movd    mem
617 3 pinsrd  mem
618 1.5 pshufd
619 */
620 
621 static
622 #if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
623                defined(Z7_BLAKE2S_USE_V256_WAY2))
624   Z7_NO_INLINE
625 #else
626   Z7_FORCE_INLINE
627 #endif
628 #ifdef BLAKE2S_ATTRIB_128BIT
629        BLAKE2S_ATTRIB_128BIT
630 #endif
631 void
632 Z7_FASTCALL
633 Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
634 {
635   __m128i a, b, c, d;
636   __m128i f0, f1;
637 
638   LOAD_ROTATE_CONSTS
639   d = LOAD_128_FROM_STRUCT(STATE_T(s));
640   c = k_iv0_128;
641   a = f0 = LOAD_128_FROM_STRUCT(s);
642   b = f1 = LOAD_128_FROM_STRUCT(s + 4);
643   D_ADD_EPI64_128(d, k_inc);
644   STORE_128_TO_STRUCT (STATE_T(s), d);
645   D_XOR_128(d, k_iv4_128);
646 
647 #define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
648       RR2(a, b, c, d, input) }
649 
650   ROUNDS_LOOP(RR)
651 #undef RR
652 
653   STORE_128_TO_STRUCT(s    , XOR_128(f0, XOR_128(a, c)));
654   STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
655 }
656 
657 
658 static
659 Z7_NO_INLINE
660 #ifdef BLAKE2S_ATTRIB_128BIT
661        BLAKE2S_ATTRIB_128BIT
662 #endif
663 void
664 Z7_FASTCALL
665 Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
666 {
667   size_t pos = 0;
668   do
669   {
670     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
671     Blake2s_Compress_V128_Way1(s, data);
672     data += Z7_BLAKE2S_BLOCK_SIZE;
673     pos  += Z7_BLAKE2S_BLOCK_SIZE;
674     pos &= SUPER_BLOCK_MASK;
675   }
676   while (data != end);
677 }
678 
679 
680 #if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
681     defined(Z7_BLAKE2S_USE_AVX2_WAY2)
682 #if 1
683   #define Z7_BLAKE2S_CompressSingleBlock(s, data) \
684     Blake2sp_Compress2_V128_Way1(s, data, \
685         (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
686 #else
687   #define Z7_BLAKE2S_CompressSingleBlock  Blake2s_Compress_V128_Way1
688 #endif
689 #endif
690 
691 
692 #if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
693      defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
694     !defined(Z7_BLAKE2S_USE_GATHER)
695 #define AXR2_LOAD_INDEXES(sigma_index) \
696       const unsigned i0 = sigma[(sigma_index)]; \
697       const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
698       const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
699       const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
700 
701 #define SET_ROW_FROM_SIGMA_W(input) \
702     SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
703 #endif
704 
705 
706 #ifdef Z7_BLAKE2S_USE_V128_WAY2
707 
708 #if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
709 /* we use SET_ROW_FROM_SIGMA_BASE, which uses
710    (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined,
711    (SSE2) _mm_set_epi32() otherwise.
712    MSVC can be faster for this branch:
713 */
714 #define AXR2_W(sigma_index, shift1, shift2) \
715   { \
716     AXR2_LOAD_INDEXES(sigma_index) \
717     a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
718     a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
719     AXR(a0, b0, d0, shift1) \
720     AXR(a1, b1, d1, shift1) \
721     AXR(c0, d0, b0, shift2) \
722     AXR(c1, d1, b1, shift2) \
723   }
724 #else
725 /* we use interleaved _mm_insert_epi32():
726    GCC can be faster for this branch:
727 */
728 #define AXR2_W_PRE_INSERT(sigma_index, i) \
729   { const unsigned ii = sigma[(sigma_index) + i * 2]; \
730     t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii),                      i); \
731     t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
732   }
733 #define AXR2_W(sigma_index, shift1, shift2) \
734   { __m128i t0, t1; \
735     { const unsigned ii = sigma[sigma_index]; \
736       t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
737       t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
738     } \
739     AXR2_W_PRE_INSERT(sigma_index, 1) \
740     AXR2_W_PRE_INSERT(sigma_index, 2) \
741     AXR2_W_PRE_INSERT(sigma_index, 3) \
742     a0 = _mm_add_epi32(a0, t0); \
743     a1 = _mm_add_epi32(a1, t1); \
744     AXR(a0, b0, d0, shift1) \
745     AXR(a1, b1, d1, shift1) \
746     AXR(c0, d0, b0, shift2) \
747     AXR(c1, d1, b1, shift2) \
748   }
749 #endif
750 
751 
752 #define AXR4_W(sigma_index) \
753     AXR2_W(sigma_index,     16, 12) \
754     AXR2_W(sigma_index + 1,  8,  7) \
755 
756 #define WW(r) \
757   { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
758     AXR4_W(0) \
759       ROTATE_WORDS_TO_RIGHT(b0, 1) \
760       ROTATE_WORDS_TO_RIGHT(b1, 1) \
761       ROTATE_WORDS_TO_RIGHT(c0, 2) \
762       ROTATE_WORDS_TO_RIGHT(c1, 2) \
763       ROTATE_WORDS_TO_RIGHT(d0, 3) \
764       ROTATE_WORDS_TO_RIGHT(d1, 3) \
765     AXR4_W(8) \
766       ROTATE_WORDS_TO_RIGHT(b0, 3) \
767       ROTATE_WORDS_TO_RIGHT(b1, 3) \
768       ROTATE_WORDS_TO_RIGHT(c0, 2) \
769       ROTATE_WORDS_TO_RIGHT(c1, 2) \
770       ROTATE_WORDS_TO_RIGHT(d0, 1) \
771       ROTATE_WORDS_TO_RIGHT(d1, 1) \
772   }
773 
774 
775 static
776 Z7_NO_INLINE
777 #ifdef BLAKE2S_ATTRIB_128BIT
778        BLAKE2S_ATTRIB_128BIT
779 #endif
780 void
781 Z7_FASTCALL
782 Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
783 {
784   size_t pos = 0;
785   end -= Z7_BLAKE2S_BLOCK_SIZE;
786 
787   if (data != end)
788   {
789     LOAD_ROTATE_CONSTS
790     do
791     {
792       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
793       __m128i a0, b0, c0, d0;
794       __m128i a1, b1, c1, d1;
795       {
796         const __m128i inc = k_inc;
797         const __m128i temp = k_iv4_128;
798         d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
799         d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
800         D_ADD_EPI64_128(d0, inc);
801         D_ADD_EPI64_128(d1, inc);
802         STORE_128_TO_STRUCT (STATE_T(s      ), d0);
803         STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
804         D_XOR_128(d0, temp);
805         D_XOR_128(d1, temp);
806       }
807       c1 = c0 = k_iv0_128;
808       a0 = LOAD_128_FROM_STRUCT(s);
809       b0 = LOAD_128_FROM_STRUCT(s + 4);
810       a1 = LOAD_128_FROM_STRUCT(s + NSW);
811       b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
812 
813       ROUNDS_LOOP (WW)
814 
815 #undef WW
816 
817       D_XOR_128(a0, c0);
818       D_XOR_128(b0, d0);
819       D_XOR_128(a1, c1);
820       D_XOR_128(b1, d1);
821 
822       D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
823       D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
824       D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
825       D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
826 
827       STORE_128_TO_STRUCT(s,           a0);
828       STORE_128_TO_STRUCT(s + 4,       b0);
829       STORE_128_TO_STRUCT(s + NSW,     a1);
830       STORE_128_TO_STRUCT(s + NSW + 4, b1);
831 
832       data += Z7_BLAKE2S_BLOCK_SIZE * 2;
833       pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
834       pos &= SUPER_BLOCK_MASK;
835     }
836     while (data < end);
837     if (data != end)
838       return;
839   }
840   {
841     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
842     Z7_BLAKE2S_CompressSingleBlock(s, data);
843   }
844 }
845 #endif // Z7_BLAKE2S_USE_V128_WAY2
846 
847 
848 #ifdef Z7_BLAKE2S_USE_V128_WAY2
849   #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way2
850 #else
851   #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way1
852 #endif
853 
854 
855 
856 #ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
857   #define ROT_128_8(x)    MM_ROR_EPI32(x, 8)
858   #define ROT_128_16(x)   MM_ROR_EPI32(x, 16)
859   #define ROT_128_7(x)    MM_ROR_EPI32(x, 7)
860   #define ROT_128_12(x)   MM_ROR_EPI32(x, 12)
861 #else
862 #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
863   #define ROT_128_8(x)    _mm_shuffle_epi8(x, r8)   // k_r8
864   #define ROT_128_16(x)   _mm_shuffle_epi8(x, r16)  // k_r16
865 #else
866   #define ROT_128_8(x)    MM_ROR_EPI32_VIA_SHIFT(x, 8)
867   #define ROT_128_16(x)   MM_ROR_EPI32_VIA_SHIFT(x, 16)
868 #endif
869   #define ROT_128_7(x)    MM_ROR_EPI32_VIA_SHIFT(x, 7)
870   #define ROT_128_12(x)   MM_ROR_EPI32_VIA_SHIFT(x, 12)
871 #endif
872 
873 
874 #if 1
875 // this branch can provide similar speed on x86* in most cases,
876 // because [base + index*4] provides same speed as [base + index].
877 // but some compilers can generate different code with this branch, that can be faster sometimes.
878 // this branch uses additional table of 10*16=160 bytes.
879 #define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
880         SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
881 MY_ALIGN(16)
882 static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
883   { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
884 #define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_16[r];
885 #define GET_SIGMA_VAL_128(n)  (sigma[n])
886 #else
887 #define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
888 #define GET_SIGMA_VAL_128(n)  (4 * (size_t)sigma[n])
889 #endif
890 
891 
892 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
893 #if 1
894 #define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
895         SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
896 MY_ALIGN(64)
897 static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
898   { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
899 #define GET_SIGMA_PTR_256(r)  const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
900 #define GET_SIGMA_VAL_256(n)  (sigma[n])
901 #else
902 #define GET_SIGMA_PTR_256(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
903 #define GET_SIGMA_VAL_256(n)  (8 * (size_t)sigma[n])
904 #endif
905 #endif // Z7_BLAKE2S_USE_AVX2_FAST
906 
907 
908 #define D_ROT_128_7(dest)     dest = ROT_128_7(dest)
909 #define D_ROT_128_8(dest)     dest = ROT_128_8(dest)
910 #define D_ROT_128_12(dest)    dest = ROT_128_12(dest)
911 #define D_ROT_128_16(dest)    dest = ROT_128_16(dest)
912 
913 #define OP_L(a, i)   D_ADD_128 (V(a, 0), \
914     LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
915 
916 #define OP_0(a)   OP_L(a, 0)
917 #define OP_7(a)   OP_L(a, 1)
918 
919 #define OP_1(a)   D_ADD_128 (V(a, 0), V(a, 1));
920 #define OP_2(a)   D_XOR_128 (V(a, 3), V(a, 0));
921 #define OP_4(a)   D_ADD_128 (V(a, 2), V(a, 3));
922 #define OP_5(a)   D_XOR_128 (V(a, 1), V(a, 2));
923 
924 #define OP_3(a)   D_ROT_128_16 (V(a, 3));
925 #define OP_6(a)   D_ROT_128_12 (V(a, 1));
926 #define OP_8(a)   D_ROT_128_8  (V(a, 3));
927 #define OP_9(a)   D_ROT_128_7  (V(a, 1));
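/*
  OP_0..OP_9 are the individual steps of the G function, split out so that the
  V4G / OP_INTER_* macros below can interleave several G calls for better scheduling:
    OP_0 / OP_7 : a += m[sigma[2i]] / a += m[sigma[2i+1]]
    OP_1 : a += b        OP_2 : d ^= a        OP_3 : d = ror16(d)
    OP_4 : c += d        OP_5 : b ^= c        OP_6 : b = ror12(b)
    OP_8 : d = ror8(d)   OP_9 : b = ror7(b)
*/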
928 
929 
930 // for 32-bit x86 : interleave mode runs slower, because of register pressure.
931 
932 #if 0 || 1 && (defined(MY_CPU_X86) \
933   || defined(__GNUC__) && !defined(__clang__))
934 // non-interleaved version:
935 // is fast for x86 32-bit.
936 // is fast for GCC x86-64.
937 
938 #define V4G(a) \
939   OP_0 (a) \
940   OP_1 (a) \
941   OP_2 (a) \
942   OP_3 (a) \
943   OP_4 (a) \
944   OP_5 (a) \
945   OP_6 (a) \
946   OP_7 (a) \
947   OP_1 (a) \
948   OP_2 (a) \
949   OP_8 (a) \
950   OP_4 (a) \
951   OP_5 (a) \
952   OP_9 (a) \
953 
954 #define V4R \
955 { \
956   V4G (0) \
957   V4G (1) \
958   V4G (2) \
959   V4G (3) \
960   V4G (4) \
961   V4G (5) \
962   V4G (6) \
963   V4G (7) \
964 }
965 
966 #elif 0 || 1 && defined(MY_CPU_X86)
967 
968 #define OP_INTER_2(op, a,b) \
969   op (a) \
970   op (b) \
971 
972 #define V4G(a,b) \
973   OP_INTER_2 (OP_0, a,b) \
974   OP_INTER_2 (OP_1, a,b) \
975   OP_INTER_2 (OP_2, a,b) \
976   OP_INTER_2 (OP_3, a,b) \
977   OP_INTER_2 (OP_4, a,b) \
978   OP_INTER_2 (OP_5, a,b) \
979   OP_INTER_2 (OP_6, a,b) \
980   OP_INTER_2 (OP_7, a,b) \
981   OP_INTER_2 (OP_1, a,b) \
982   OP_INTER_2 (OP_2, a,b) \
983   OP_INTER_2 (OP_8, a,b) \
984   OP_INTER_2 (OP_4, a,b) \
985   OP_INTER_2 (OP_5, a,b) \
986   OP_INTER_2 (OP_9, a,b) \
987 
988 #define V4R \
989 { \
990   V4G (0, 1) \
991   V4G (2, 3) \
992   V4G (4, 5) \
993   V4G (6, 7) \
994 }
995 
996 #else
997 // interleave-4 version is fast for x64 (MSVC/CLANG)
998 
999 #define OP_INTER_4(op, a,b,c,d) \
1000   op (a) \
1001   op (b) \
1002   op (c) \
1003   op (d) \
1004 
1005 #define V4G(a,b,c,d) \
1006   OP_INTER_4 (OP_0, a,b,c,d) \
1007   OP_INTER_4 (OP_1, a,b,c,d) \
1008   OP_INTER_4 (OP_2, a,b,c,d) \
1009   OP_INTER_4 (OP_3, a,b,c,d) \
1010   OP_INTER_4 (OP_4, a,b,c,d) \
1011   OP_INTER_4 (OP_5, a,b,c,d) \
1012   OP_INTER_4 (OP_6, a,b,c,d) \
1013   OP_INTER_4 (OP_7, a,b,c,d) \
1014   OP_INTER_4 (OP_1, a,b,c,d) \
1015   OP_INTER_4 (OP_2, a,b,c,d) \
1016   OP_INTER_4 (OP_8, a,b,c,d) \
1017   OP_INTER_4 (OP_4, a,b,c,d) \
1018   OP_INTER_4 (OP_5, a,b,c,d) \
1019   OP_INTER_4 (OP_9, a,b,c,d) \
1020 
1021 #define V4R \
1022 { \
1023   V4G (0, 1, 2, 3) \
1024   V4G (4, 5, 6, 7) \
1025 }
1026 
1027 #endif
1028 
1029 #define V4_ROUND(r)  { GET_SIGMA_PTR_128(r); V4R }
1030 
1031 
1032 #define V4_LOAD_MSG_1(w, m, i) \
1033 { \
1034   __m128i m0, m1, m2, m3; \
1035   __m128i t0, t1, t2, t3; \
1036   m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
1037   m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
1038   m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
1039   m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
1040   t0 = _mm_unpacklo_epi32(m0, m1); \
1041   t1 = _mm_unpackhi_epi32(m0, m1); \
1042   t2 = _mm_unpacklo_epi32(m2, m3); \
1043   t3 = _mm_unpackhi_epi32(m2, m3); \
1044   w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
1045   w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
1046   w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
1047   w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
1048 }
1049 
1050 #define V4_LOAD_MSG(w, m) \
1051 { \
1052   V4_LOAD_MSG_1 (w, m, 0) \
1053   V4_LOAD_MSG_1 (w, m, 1) \
1054   V4_LOAD_MSG_1 (w, m, 2) \
1055   V4_LOAD_MSG_1 (w, m, 3) \
1056 }
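/*
  V4_LOAD_MSG_1 / V4_LOAD_MSG perform a 4x4 transpose of 32-bit words: the source is
  4 consecutive 64-byte message blocks (one per lane), and the result w[0..15] holds
  message word k of all four blocks in w[k], one word per SIMD lane. This is what lets
  V4_ROUND add a message word with a single aligned LOAD_128 per OP_L step.
*/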
1057 
1058 #define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
1059 { \
1060   const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i    ) * 4);  \
1061   const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4);  \
1062   d0 = _mm_unpacklo_epi32(v0, v1);  \
1063   d1 = _mm_unpackhi_epi32(v0, v1);  \
1064 }
1065 
1066 #define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
1067 { \
1068   STORE_128_TO_STRUCT((dest32) + i * 4     , _mm_unpacklo_epi64(s0, s1));  \
1069   STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1));  \
1070 }
1071 
1072 #define V4_UNPACK_STATE(dest32, src32) \
1073 { \
1074   __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
1075   V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1)  \
1076   V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3)  \
1077   V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5)  \
1078   V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7)  \
1079   V4_UNPACK_PAIR_128(dest32, 0, t0, t2)  \
1080   V4_UNPACK_PAIR_128(dest32, 8, t1, t3)  \
1081   V4_UNPACK_PAIR_128(dest32, 1, t4, t6)  \
1082   V4_UNPACK_PAIR_128(dest32, 9, t5, t7)  \
1083 }
1084 
1085 
1086 static
1087 Z7_NO_INLINE
1088 #ifdef BLAKE2S_ATTRIB_128BIT
1089        BLAKE2S_ATTRIB_128BIT
1090 #endif
1091 void
1092 Z7_FASTCALL
1093 Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1094 {
1095   // PrintStates2(s_items, 8, 16);
1096   size_t pos = 0;
1097   pos /= 2;
1098   do
1099   {
1100 #if defined(Z7_BLAKE2S_USE_SSSE3) && \
1101    !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
1102     const __m128i  r8 = k_r8;
1103     const __m128i r16 = k_r16;
1104 #endif
1105     __m128i w[16];
1106     __m128i v[16];
1107     UInt32 *s;
1108     V4_LOAD_MSG(w, data)
1109     s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1110     {
1111       __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
1112       D_ADD_EPI64_128 (ctr, k_inc);
1113       STORE_128_TO_STRUCT(s + 64, ctr);
1114       v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1115       v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1116     }
1117     v[ 8] = GET_128_IV_WAY4(0);
1118     v[ 9] = GET_128_IV_WAY4(1);
1119     v[10] = GET_128_IV_WAY4(2);
1120     v[11] = GET_128_IV_WAY4(3);
1121     v[14] = GET_128_IV_WAY4(6);
1122     v[15] = GET_128_IV_WAY4(7);
1123 
1124 #define LOAD_STATE_128_FROM_STRUCT(i) \
1125       v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
1126 
1127 #define UPDATE_STATE_128_IN_STRUCT(i) \
1128       STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
1129       XOR_128(v[i], v[(i) + 8]), \
1130       LOAD_128_FROM_STRUCT(s + (i) * 4)));
1131 
1132     REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
1133     ROUNDS_LOOP (V4_ROUND)
1134     REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
1135 
1136     data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1137     pos  += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
1138     pos &= SUPER_BLOCK_SIZE / 2 - 1;
1139   }
1140   while (data != end);
1141 }
1142 
1143 
1144 static
1145 Z7_NO_INLINE
1146 #ifdef BLAKE2S_ATTRIB_128BIT
1147        BLAKE2S_ATTRIB_128BIT
1148 #endif
1149 void
1150 Z7_FASTCALL
1151 Blake2sp_Final_V128_Fast(UInt32 *states)
1152 {
1153   const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1154   // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
1155   // PrintStates2(states, 8, 16);
1156   {
1157     ptrdiff_t pos = 8 * 4;
1158     do
1159     {
1160       UInt32 *src32  = states + (size_t)(pos * 1);
1161       UInt32 *dest32 = states + (size_t)(pos * 2);
1162       V4_UNPACK_STATE(dest32, src32)
1163       pos -= 8 * 4;
1164     }
1165     while (pos >= 0);
1166   }
1167   {
1168     unsigned k;
1169     for (k = 0; k < 8; k++)
1170     {
1171       UInt32 *s = states + (size_t)k * 16;
1172       STORE_128_TO_STRUCT (STATE_T(s), ctr);
1173     }
1174   }
1175   // PrintStates2(states, 8, 16);
1176 }
1177 
1178 
1179 
1180 #ifdef Z7_BLAKE2S_USE_AVX2
1181 
1182 #define ADD_256(a, b)  _mm256_add_epi32(a, b)
1183 #define XOR_256(a, b)  _mm256_xor_si256(a, b)
1184 
1185 #if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
1186   #define MM256_ROR_EPI32  _mm256_ror_epi32
1187   #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
1188 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1189   #define LOAD_ROTATE_CONSTS_256
1190 #endif
1191 #else
1192 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1193 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1194   #define LOAD_ROTATE_CONSTS_256 \
1195       const __m256i  r8 = k_r8_256; \
1196       const __m256i r16 = k_r16_256;
1197 #endif // AVX2_WAY2
1198 
1199   #define MM256_ROR_EPI32(r, c) ( \
1200       ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
1201     : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
1202     : _mm256_or_si256( \
1203       _mm256_srli_epi32((r), (c)), \
1204       _mm256_slli_epi32((r), 32-(c))))
1205 #endif // WAY_SLOW
1206 #endif
1207 
1208 
1209 #define D_ADD_256(dest, src)  dest = ADD_256(dest, src)
1210 #define D_XOR_256(dest, src)  dest = XOR_256(dest, src)
1211 
1212 #define LOADU_256(p)     _mm256_loadu_si256((const __m256i *)(const void *)(p))
1213 
1214 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
1215 
1216 #ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1217 #define ROT_256_16(x) MM256_ROR_EPI32((x), 16)
1218 #define ROT_256_12(x) MM256_ROR_EPI32((x), 12)
1219 #define ROT_256_8(x)  MM256_ROR_EPI32((x),  8)
1220 #define ROT_256_7(x)  MM256_ROR_EPI32((x),  7)
1221 #else
1222 #define ROTATE8  _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
1223                                  12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
1224 #define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
1225                                  13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
1226 #define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16)
1227 #define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
1228 #define ROT_256_8(x)  _mm256_shuffle_epi8((x), ROTATE8)
1229 #define ROT_256_7(x)  _mm256_or_si256(_mm256_srli_epi32((x),  7), _mm256_slli_epi32((x), 25))
1230 #endif
1231 
1232 #define D_ROT_256_7(dest)     dest = ROT_256_7(dest)
1233 #define D_ROT_256_8(dest)     dest = ROT_256_8(dest)
1234 #define D_ROT_256_12(dest)    dest = ROT_256_12(dest)
1235 #define D_ROT_256_16(dest)    dest = ROT_256_16(dest)
1236 
1237 #define LOAD_256(p)      _mm256_load_si256((const __m256i *)(const void *)(p))
1238 #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
1239   #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r)
1240   #define LOAD_256_FROM_STRUCT(p)     LOADU_256(p)
1241   #define STORE_256_TO_STRUCT(p, r)   STOREU_256(p, r)
1242 #else
1243   // if the struct is aligned to 32 bytes
1244   #define STORE_256(p, r)  _mm256_store_si256((__m256i *)(void *)(p), r)
1245   #define LOAD_256_FROM_STRUCT(p)     LOAD_256(p)
1246   #define STORE_256_TO_STRUCT(p, r)   STORE_256(p, r)
1247 #endif
1248 
1249 #endif // Z7_BLAKE2S_USE_AVX2_FAST
1250 
1251 
1252 
1253 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1254 
1255 #if 0
1256     #define DIAG_PERM2(s) \
1257     { \
1258       const __m256i a = LOAD_256_FROM_STRUCT((s)      ); \
1259       const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
1260       STORE_256_TO_STRUCT((s      ), _mm256_permute2x128_si256(a, b, 0x20)); \
1261       STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
1262     }
1263 #else
1264     #define DIAG_PERM2(s) \
1265     { \
1266       const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
1267       const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
1268       STORE_128_TO_STRUCT((s) + NSW, a); \
1269       STORE_128_TO_STRUCT((s) + 4  , b); \
1270     }
1271 #endif
1272     #define DIAG_PERM8(s_items) \
1273     { \
1274       DIAG_PERM2(s_items) \
1275       DIAG_PERM2(s_items + NSW * 2) \
1276       DIAG_PERM2(s_items + NSW * 4) \
1277       DIAG_PERM2(s_items + NSW * 6) \
1278     }
1279 
1280 
1281 #define AXR256(a, b, d, shift) \
1282     D_ADD_256(a, b); \
1283     D_XOR_256(d, a); \
1284     d = MM256_ROR_EPI32(d, shift); \
1285 
1286 
1287 
1288 #ifdef Z7_BLAKE2S_USE_GATHER
1289 
1290   #define TABLE_GATHER_256_4(a0,a1,a2,a3) \
1291     a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
1292   #define TABLE_GATHER_256( \
1293     a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
1294   { TABLE_GATHER_256_4(a0,a2,a4,a6), \
1295     TABLE_GATHER_256_4(a1,a3,a5,a7), \
1296     TABLE_GATHER_256_4(a8,a10,a12,a14), \
1297     TABLE_GATHER_256_4(a9,a11,a13,a15) }
1298 MY_ALIGN(64)
1299 static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
1300   { SIGMA_TABLE(TABLE_GATHER_256) };
1301   #define GET_SIGMA(r) \
1302     const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
1303   #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1304     const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
1305   #define SET_ROW_FROM_SIGMA_AVX(in) \
1306     _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
1307   #define SIGMA_INTERLEAVE    8
1308   #define SIGMA_HALF_ROW_SIZE 16
1309 
1310 #else // !Z7_BLAKE2S_USE_GATHER
1311 
1312   #define GET_SIGMA(r) \
1313     const Byte * const sigma = k_Blake2s_Sigma_4[r];
1314   #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1315     AXR2_LOAD_INDEXES(sigma_index)
1316   #define SET_ROW_FROM_SIGMA_AVX(in) \
1317     MY_mm256_set_m128i( \
1318         SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
1319         SET_ROW_FROM_SIGMA_W(in))
1320   #define SIGMA_INTERLEAVE    1
1321   #define SIGMA_HALF_ROW_SIZE 8
1322 #endif // !Z7_BLAKE2S_USE_GATHER
1323 
1324 
1325 #define ROTATE_WORDS_TO_RIGHT_256(a, n) \
1326     a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
1327 
1328 
1329 
1330 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1331 
1332 #define AXR2_A(sigma_index, shift1, shift2) \
1333     AXR2_LOAD_INDEXES_AVX(sigma_index) \
1334     D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1335     AXR256(a0, b0, d0, shift1) \
1336     AXR256(c0, d0, b0, shift2) \
1337 
1338 #define AXR4_A(sigma_index) \
1339     { AXR2_A(sigma_index, 16, 12) } \
1340     { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1341 
1342 #define EE1(r) \
1343     { GET_SIGMA(r) \
1344       AXR4_A(0) \
1345         ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1346         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1347         ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1348       AXR4_A(SIGMA_HALF_ROW_SIZE) \
1349         ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1350         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1351         ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1352     }
1353 
1354 static
1355 Z7_NO_INLINE
1356 #ifdef BLAKE2S_ATTRIB_AVX2
1357        BLAKE2S_ATTRIB_AVX2
1358 #endif
1359 void
1360 Z7_FASTCALL
1361 Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
1362 {
1363   size_t pos = 0;
1364   end -= Z7_BLAKE2S_BLOCK_SIZE;
1365 
1366   if (data != end)
1367   {
1368     LOAD_ROTATE_CONSTS_256
1369     DIAG_PERM8(s_items)
1370     do
1371     {
1372       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1373       __m256i a0, b0, c0, d0;
1374       {
1375         const __m128i inc = k_inc;
1376         __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1377         __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1378         D_ADD_EPI64_128(d0_128, inc);
1379         D_ADD_EPI64_128(d1_128, inc);
1380         STORE_128_TO_STRUCT (STATE_T(s      ), d0_128);
1381         STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
1382         d0 = MY_mm256_set_m128i(d1_128, d0_128);
1383         D_XOR_256(d0, k_iv4_256);
1384       }
1385       c0 = SET_FROM_128(k_iv0_128);
1386       a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1387       b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1388 
1389       ROUNDS_LOOP (EE1)
1390 
1391       D_XOR_256(a0, c0);
1392       D_XOR_256(b0, d0);
1393 
1394       D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1395       D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1396 
1397       STORE_256_TO_STRUCT(s + NSW * 0, a0);
1398       STORE_256_TO_STRUCT(s + NSW * 1, b0);
1399 
1400       data += Z7_BLAKE2S_BLOCK_SIZE * 2;
1401       pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
1402       pos &= SUPER_BLOCK_MASK;
1403     }
1404     while (data < end);
1405     DIAG_PERM8(s_items)
1406     if (data != end)
1407       return;
1408   }
1409   {
1410     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1411     Z7_BLAKE2S_CompressSingleBlock(s, data);
1412   }
1413 }
1414 
1415 #endif // Z7_BLAKE2S_USE_AVX2_WAY2
1416 
1417 
1418 
1419 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
1420 
1421 #define AXR2_X(sigma_index, shift1, shift2) \
1422     AXR2_LOAD_INDEXES_AVX(sigma_index) \
1423     D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1424     D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
1425     AXR256(a0, b0, d0, shift1) \
1426     AXR256(a1, b1, d1, shift1) \
1427     AXR256(c0, d0, b0, shift2) \
1428     AXR256(c1, d1, b1, shift2) \
1429 
1430 #define AXR4_X(sigma_index) \
1431     { AXR2_X(sigma_index, 16, 12) } \
1432     { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1433 
1434 #define EE2(r) \
1435     { GET_SIGMA(r) \
1436       AXR4_X(0) \
1437         ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1438         ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
1439         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1440         ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1441         ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1442         ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
1443       AXR4_X(SIGMA_HALF_ROW_SIZE) \
1444         ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1445         ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
1446         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1447         ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1448         ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1449         ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
1450     }
1451 
1452 static
1453 Z7_NO_INLINE
1454 #ifdef BLAKE2S_ATTRIB_AVX2
1455        BLAKE2S_ATTRIB_AVX2
1456 #endif
1457 void
1458 Z7_FASTCALL
1459 Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
1460 {
1461   size_t pos = 0;
1462 
1463   if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
1464   {
1465 #ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1466     const __m256i  r8 = k_r8_256;
1467     const __m256i r16 = k_r16_256;
1468 #endif
1469     end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
1470     DIAG_PERM8(s_items)
1471     do
1472     {
1473       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1474       __m256i a0, b0, c0, d0;
1475       __m256i a1, b1, c1, d1;
1476       {
1477         const __m128i inc = k_inc;
1478         __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1479         __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1480         __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
1481         __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
1482         D_ADD_EPI64_128(d0_128, inc);
1483         D_ADD_EPI64_128(d1_128, inc);
1484         D_ADD_EPI64_128(d2_128, inc);
1485         D_ADD_EPI64_128(d3_128, inc);
1486         STORE_128_TO_STRUCT (STATE_T(s          ), d0_128);
1487         STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
1488         STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
1489         STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
1490         d0 = MY_mm256_set_m128i(d1_128, d0_128);
1491         d1 = MY_mm256_set_m128i(d3_128, d2_128);
1492         D_XOR_256(d0, k_iv4_256);
1493         D_XOR_256(d1, k_iv4_256);
1494       }
1495       c1 = c0 = SET_FROM_128(k_iv0_128);
1496       a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1497       b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1498       a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
1499       b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);
1500 
1501       ROUNDS_LOOP (EE2)
1502 
1503       D_XOR_256(a0, c0);
1504       D_XOR_256(b0, d0);
1505       D_XOR_256(a1, c1);
1506       D_XOR_256(b1, d1);
1507 
1508       D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1509       D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1510       D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
1511       D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
1512 
1513       STORE_256_TO_STRUCT(s + NSW * 0, a0);
1514       STORE_256_TO_STRUCT(s + NSW * 1, b0);
1515       STORE_256_TO_STRUCT(s + NSW * 2, a1);
1516       STORE_256_TO_STRUCT(s + NSW * 3, b1);
1517 
1518       data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1519       pos  += Z7_BLAKE2S_BLOCK_SIZE * 4;
1520       pos &= SUPER_BLOCK_MASK;
1521     }
1522     while (data < end);
1523     DIAG_PERM8(s_items)
1524     end += Z7_BLAKE2S_BLOCK_SIZE * 3;
1525   }
1526   if (data == end)
1527     return;
1528   // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
1529   do
1530   {
1531     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1532     Z7_BLAKE2S_CompressSingleBlock(s, data);
1533     data += Z7_BLAKE2S_BLOCK_SIZE;
1534     pos  += Z7_BLAKE2S_BLOCK_SIZE;
1535     pos &= SUPER_BLOCK_MASK;
1536   }
1537   while (data != end);
1538 }
1539 
1540 #endif // Z7_BLAKE2S_USE_AVX2_WAY4
1541 #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1542 
1543 
1544 // ---------------------------------------------------------
1545 
1546 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
1547 
1548 #define OP256_L(a, i)   D_ADD_256 (V(a, 0), \
1549     LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));
1550 
1551 #define OP256_0(a)   OP256_L(a, 0)
1552 #define OP256_7(a)   OP256_L(a, 1)
1553 
1554 #define OP256_1(a)   D_ADD_256 (V(a, 0), V(a, 1));
1555 #define OP256_2(a)   D_XOR_256 (V(a, 3), V(a, 0));
1556 #define OP256_4(a)   D_ADD_256 (V(a, 2), V(a, 3));
1557 #define OP256_5(a)   D_XOR_256 (V(a, 1), V(a, 2));
1558 
1559 #define OP256_3(a)   D_ROT_256_16 (V(a, 3));
1560 #define OP256_6(a)   D_ROT_256_12 (V(a, 1));
1561 #define OP256_8(a)   D_ROT_256_8  (V(a, 3));
1562 #define OP256_9(a)   D_ROT_256_7  (V(a, 1));
1563 
1564 
1565 #if 0 || 1 && defined(MY_CPU_X86)
1566 
1567 #define V8_G(a) \
1568   OP256_0 (a) \
1569   OP256_1 (a) \
1570   OP256_2 (a) \
1571   OP256_3 (a) \
1572   OP256_4 (a) \
1573   OP256_5 (a) \
1574   OP256_6 (a) \
1575   OP256_7 (a) \
1576   OP256_1 (a) \
1577   OP256_2 (a) \
1578   OP256_8 (a) \
1579   OP256_4 (a) \
1580   OP256_5 (a) \
1581   OP256_9 (a) \
1582 
1583 #define V8R { \
1584   V8_G (0); \
1585   V8_G (1); \
1586   V8_G (2); \
1587   V8_G (3); \
1588   V8_G (4); \
1589   V8_G (5); \
1590   V8_G (6); \
1591   V8_G (7); \
1592 }
1593 
1594 #else
1595 
1596 #define OP256_INTER_4(op, a,b,c,d) \
1597   op (a) \
1598   op (b) \
1599   op (c) \
1600   op (d) \
1601 
1602 #define V8_G(a,b,c,d) \
1603   OP256_INTER_4 (OP256_0, a,b,c,d) \
1604   OP256_INTER_4 (OP256_1, a,b,c,d) \
1605   OP256_INTER_4 (OP256_2, a,b,c,d) \
1606   OP256_INTER_4 (OP256_3, a,b,c,d) \
1607   OP256_INTER_4 (OP256_4, a,b,c,d) \
1608   OP256_INTER_4 (OP256_5, a,b,c,d) \
1609   OP256_INTER_4 (OP256_6, a,b,c,d) \
1610   OP256_INTER_4 (OP256_7, a,b,c,d) \
1611   OP256_INTER_4 (OP256_1, a,b,c,d) \
1612   OP256_INTER_4 (OP256_2, a,b,c,d) \
1613   OP256_INTER_4 (OP256_8, a,b,c,d) \
1614   OP256_INTER_4 (OP256_4, a,b,c,d) \
1615   OP256_INTER_4 (OP256_5, a,b,c,d) \
1616   OP256_INTER_4 (OP256_9, a,b,c,d) \
1617 
1618 #define V8R { \
1619   V8_G (0, 1, 2, 3) \
1620   V8_G (4, 5, 6, 7) \
1621 }
1622 #endif
1623 
1624 #define V8_ROUND(r)  { GET_SIGMA_PTR_256(r); V8R }
1625 
1626 
1627 // for debug:
1628 // #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
1629 #if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
1630 // gather instruction is slow.
1631 #define V8_LOAD_MSG(w, m) \
1632 { \
1633   unsigned i; \
1634   for (i = 0; i < 16; ++i) { \
1635     w[i] = _mm256_i32gather_epi32( \
1636       (const void *)((m) + i * sizeof(UInt32)),\
1637       _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
1638       sizeof(UInt32)); \
1639   } \
1640 }
1641 #else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1642 
1643 #define V8_LOAD_MSG_2(w, a0, a1) \
1644 { \
1645   (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20);  \
1646   (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31);  \
1647 }
1648 
1649 #define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
1650 { \
1651   __m256i s0, s1, s2, s3;  \
1652   s0 = _mm256_unpacklo_epi64(z0, z1);  \
1653   s1 = _mm256_unpackhi_epi64(z0, z1);  \
1654   s2 = _mm256_unpacklo_epi64(z2, z3);  \
1655   s3 = _mm256_unpackhi_epi64(z2, z3);  \
1656   V8_LOAD_MSG_2((w) + 0, s0, s2)   \
1657   V8_LOAD_MSG_2((w) + 1, s1, s3)   \
1658 }
1659 
1660 #define V8_LOAD_MSG_0(t0, t1, m) \
1661 { \
1662   __m256i m0, m1;  \
1663   m0 = LOADU_256(m);  \
1664   m1 = LOADU_256((m) + 2 * 32);  \
1665   t0 = _mm256_unpacklo_epi32(m0, m1);  \
1666   t1 = _mm256_unpackhi_epi32(m0, m1);  \
1667 }
1668 
1669 #define V8_LOAD_MSG_8(w, m) \
1670 { \
1671   __m256i t0, t1, t2, t3, t4, t5, t6, t7;  \
1672   V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32)  \
1673   V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32)  \
1674   V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32)  \
1675   V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32)  \
1676   V8_LOAD_MSG_4((w)    , t0, t1, t2, t3)   \
1677   V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7)   \
1678 }
1679 
1680 #define V8_LOAD_MSG(w, m) \
1681 { \
1682   V8_LOAD_MSG_8(w, m)  \
1683   V8_LOAD_MSG_8((w) + 8, (m) + 32)  \
1684 }
1685 
1686 #endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
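#if 0
/* Illustrative scalar sketch (not used by the code) of the layout that
   V8_LOAD_MSG builds: lane j of w[i] receives message word i of the j-th
   64-byte block, i.e. the 8 x 16 matrix of UInt32 message words is
   transposed so that one __m256i holds the same word index for all 8
   parallel lanes.  This restates the (slow) gather variant above; the
   unpack/permute variant produces the same layout. */
static void V8_LoadMsg_Reference(UInt32 w[16][8], const Byte *m)
{
  unsigned i, j;
  for (i = 0; i < 16; i++)
    for (j = 0; j < 8; j++)
      w[i][j] = GetUi32(m + (size_t)j * Z7_BLAKE2S_BLOCK_SIZE + (size_t)i * 4);
}
#endif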
1687 
1688 
1689 #define V8_PERM_PAIR_STORE(u, a0, a2) \
1690 { \
1691   STORE_256_TO_STRUCT((u),     _mm256_permute2x128_si256(a0, a2, 0x20));  \
1692   STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31));  \
1693 }
1694 
1695 #define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
1696 { \
1697   __m256i s0, s1, s2, s3;  \
1698   s0 = _mm256_unpacklo_epi64(z0, z1);  \
1699   s1 = _mm256_unpackhi_epi64(z0, z1);  \
1700   s2 = _mm256_unpacklo_epi64(z2, z3);  \
1701   s3 = _mm256_unpackhi_epi64(z2, z3);  \
1702   V8_PERM_PAIR_STORE(u + 0, s0, s2)  \
1703   V8_PERM_PAIR_STORE(u + 2, s1, s3)  \
1704 }
1705 
1706 #define V8_UNPACK_STORE_0(src32, d0, d1) \
1707 { \
1708   const __m256i v0 = LOAD_256_FROM_STRUCT ((src32)    );  \
1709   const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8);  \
1710   d0 = _mm256_unpacklo_epi32(v0, v1);  \
1711   d1 = _mm256_unpackhi_epi32(v0, v1);  \
1712 }
1713 
1714 #define V8_UNPACK_STATE(dest32, src32) \
1715 { \
1716   __m256i t0, t1, t2, t3, t4, t5, t6, t7;  \
1717   V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4)  \
1718   V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5)  \
1719   V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6)  \
1720   V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7)  \
1721   V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32)    , t0, t1, t2, t3)  \
1722   V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7)  \
1723 }
1724 
1725 
1726 
1727 #define V8_LOAD_STATE_256_FROM_STRUCT(i) \
1728       v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);
1729 
1730 #if 0 || 0 && defined(MY_CPU_X86)
1731 #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1732 #endif
1733 
1734 #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1735 // this branch doesn't use the (iv) array,
1736 // so register pressure can be lower.
1737 // it can be faster in some cases.
1738 #define V8_LOAD_STATE_256(i)  V8_LOAD_STATE_256_FROM_STRUCT(i)
1739 #define V8_UPDATE_STATE_256(i) \
1740 { \
1741     STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
1742     XOR_256(v[i], v[(i) + 8]), \
1743     LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
1744 }
1745 #else
1746 // this branch uses more variables (iv), and so more registers.
1747 // it's better for gcc.
1748 // this branch may be better, if register pressure is lower (avx512).
1749 #define V8_LOAD_STATE_256(i)   { iv[i] = v[i]; }
1750 #define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
1751 #define V8_STORE_STATE_256(i)  { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
1752 #endif
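/* Note: both branches above implement the standard BLAKE2s state update
   h[i] ^= v[i] ^ v[i + 8], applied here to 8 independent lanes per __m256i.
   They differ only in whether the previous state is re-read from s_items
   or kept in the iv[] registers. */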
1753 
1754 
1755 #if 0
1756   // use loading constants from memory
1757   #define KK8(n)  KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
1758 MY_ALIGN(64)
1759 static const UInt32 k_Blake2s_IV_WAY8[]=
1760 {
1761   KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
1762 };
1763   #define GET_256_IV_WAY8(i)  LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
1764 #else
1765   // use constant generation:
1766   #define GET_256_IV_WAY8(i)  _mm256_set1_epi32((Int32)KIV(i))
1767 #endif
1768 
1769 
1770 static
1771 Z7_NO_INLINE
1772 #ifdef BLAKE2S_ATTRIB_AVX2
1773        BLAKE2S_ATTRIB_AVX2
1774 #endif
1775 void
1776 Z7_FASTCALL
1777 Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1778 {
1779 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1780   __m256i v[16];
1781 #endif
1782 
1783   // PrintStates2(s_items, 8, 16);
1784 
1785 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1786   REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
1787 #endif
1788 
1789   do
1790   {
1791     __m256i w[16];
1792 #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1793     __m256i v[16];
1794 #else
1795     __m256i iv[8];
1796 #endif
1797     V8_LOAD_MSG(w, data)
1798     {
1799       // we load/store ctr inside the loop to reduce register pressure:
1800 #if 1 || 1 && defined(MY_CPU_X86)
1801       const __m256i ctr = _mm256_add_epi64(
1802           LOAD_256_FROM_STRUCT(s_items + 64),
1803           _mm256_set_epi32(
1804               0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
1805               0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
1806       STORE_256_TO_STRUCT(s_items + 64, ctr);
1807 #else
1808       const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
1809           + Z7_BLAKE2S_BLOCK_SIZE;
1810       const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
1811       *(UInt64 *)(void *)(s_items + 64) = ctr64;
1812 #endif
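      /* In this fast path all 8 lanes advance by the same number of bytes,
         so one 64-bit byte counter (kept at s_items + 64, replicated in both
         128-bit halves) serves every lane: its low and high 32-bit halves
         are broadcast into v[12] and v[13] below. */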
1813       v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1814       v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1815     }
1816     v[ 8] = GET_256_IV_WAY8(0);
1817     v[ 9] = GET_256_IV_WAY8(1);
1818     v[10] = GET_256_IV_WAY8(2);
1819     v[11] = GET_256_IV_WAY8(3);
1820     v[14] = GET_256_IV_WAY8(6);
1821     v[15] = GET_256_IV_WAY8(7);
1822 
1823     REP8_MACRO (V8_LOAD_STATE_256)
1824     ROUNDS_LOOP (V8_ROUND)
1825     REP8_MACRO (V8_UPDATE_STATE_256)
1826     data += SUPER_BLOCK_SIZE;
1827   }
1828   while (data != end);
1829 
1830 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1831   REP8_MACRO (V8_STORE_STATE_256)
1832 #endif
1833 }
1834 
1835 
1836 static
1837 Z7_NO_INLINE
1838 #ifdef BLAKE2S_ATTRIB_AVX2
1839        BLAKE2S_ATTRIB_AVX2
1840 #endif
1841 void
1842 Z7_FASTCALL
1843 Blake2sp_Final_AVX2_Fast(UInt32 *states)
1844 {
1845   const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1846   // PrintStates2(states, 8, 16);
1847   V8_UNPACK_STATE(states, states)
1848   // PrintStates2(states, 8, 16);
1849   {
1850     unsigned k;
1851     for (k = 0; k < 8; k++)
1852     {
1853       UInt32 *s = states + (size_t)k * 16;
1854       STORE_128_TO_STRUCT (STATE_T(s), ctr);
1855     }
1856   }
1857   // PrintStates2(states, 8, 16);
1858   // printf("\nafter V8_UNPACK_STATE \n");
1859 }
1860 
1861 #endif // Z7_BLAKE2S_USE_AVX2_FAST
1862 #endif // avx2
1863 #endif // vector
1864 
1865 
1866 /*
1867 #define Blake2s_Increment_Counter(s, inc) \
1868   { STATE_T(s)[0] += (inc);  STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
1869 #define Blake2s_Increment_Counter_Small(s, inc) \
1870   { STATE_T(s)[0] += (inc); }
1871 */
1872 
1873 #define Blake2s_Set_LastBlock(s) \
1874   { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
1875 
1876 
1877 #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
1878   // good for vs2022
1879   #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
1880 #else
1881    // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
1882   #define LOOP_8(mac) { REP8_MACRO(mac) }
1883 #endif
1884 
1885 
1886 static
1887 Z7_FORCE_INLINE
1888 // Z7_NO_INLINE
1889 void
1890 Z7_FASTCALL
1891 Blake2s_Compress(UInt32 *s, const Byte *input)
1892 {
1893   UInt32 m[16];
1894   UInt32 v[16];
1895   {
1896     unsigned i;
1897     for (i = 0; i < 16; i++)
1898       m[i] = GetUi32(input + i * 4);
1899   }
1900 
1901 #define INIT_v_FROM_s(i)  v[i] = s[i];
1902 
1903   LOOP_8(INIT_v_FROM_s)
1904 
1905   // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
1906   {
1907     const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
1908     const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
1909     STATE_T(s)[0] = t0;
1910     STATE_T(s)[1] = t1;
1911     v[12] = t0 ^ KIV(4);
1912     v[13] = t1 ^ KIV(5);
1913   }
1914   // v[12] = STATE_T(s)[0] ^ KIV(4);
1915   // v[13] = STATE_T(s)[1] ^ KIV(5);
1916   v[14] = STATE_F(s)[0] ^ KIV(6);
1917   v[15] = STATE_F(s)[1] ^ KIV(7);
1918 
1919   v[ 8] = KIV(0);
1920   v[ 9] = KIV(1);
1921   v[10] = KIV(2);
1922   v[11] = KIV(3);
1923   // PrintStates2((const UInt32 *)v, 1, 16);
1924 
1925   #define ADD_SIGMA(a, index)  V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
1926   #define ADD32M(dest, src, a)    V(a, dest) += V(a, src);
1927   #define XOR32M(dest, src, a)    V(a, dest) ^= V(a, src);
1928   #define RTR32M(dest, shift, a)  V(a, dest) = rotrFixed(V(a, dest), shift);
1929 
1930 // big interleaving can provide a big performance gain, if scheduler queues are small.
1931 #if 0 || 1 && defined(MY_CPU_X86)
1932   // interleave-1: for small register number (x86-32bit)
1933   #define G2(index, a, x, y) \
1934     ADD_SIGMA (a, (index) + 2 * 0) \
1935     ADD32M (0, 1, a) \
1936     XOR32M (3, 0, a) \
1937     RTR32M (3, x, a) \
1938     ADD32M (2, 3, a) \
1939     XOR32M (1, 2, a) \
1940     RTR32M (1, y, a) \
1941 
1942   #define G(a) \
1943     G2(a * 2    , a, 16, 12) \
1944     G2(a * 2 + 1, a,  8,  7) \
1945 
1946   #define R2 \
1947     G(0) \
1948     G(1) \
1949     G(2) \
1950     G(3) \
1951     G(4) \
1952     G(5) \
1953     G(6) \
1954     G(7) \
1955 
1956 #elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
1957   // interleave-2: good if the number of registers is not big (x86-64).
1958 
1959   #define REP2(mac, dest, src, a, b) \
1960       mac(dest, src, a) \
1961       mac(dest, src, b)
1962 
1963   #define G2(index, a, b, x, y) \
1964     ADD_SIGMA (a, (index) + 2 * 0) \
1965     ADD_SIGMA (b, (index) + 2 * 1) \
1966     REP2 (ADD32M, 0, 1, a, b) \
1967     REP2 (XOR32M, 3, 0, a, b) \
1968     REP2 (RTR32M, 3, x, a, b) \
1969     REP2 (ADD32M, 2, 3, a, b) \
1970     REP2 (XOR32M, 1, 2, a, b) \
1971     REP2 (RTR32M, 1, y, a, b) \
1972 
1973   #define G(a, b) \
1974     G2(a * 2    , a, b, 16, 12) \
1975     G2(a * 2 + 1, a, b,  8,  7) \
1976 
1977   #define R2 \
1978     G(0, 1) \
1979     G(2, 3) \
1980     G(4, 5) \
1981     G(6, 7) \
1982 
1983 #else
1984   // interleave-4:
1985   // it has big register pressure for x86/x64.
1986   // and MSVC compilers for x86/x64 are slow for this branch.
1987   // but if we have a big number of registers, this branch can be faster.
1988 
1989   #define REP4(mac, dest, src, a, b, c, d) \
1990       mac(dest, src, a) \
1991       mac(dest, src, b) \
1992       mac(dest, src, c) \
1993       mac(dest, src, d)
1994 
1995   #define G2(index, a, b, c, d, x, y) \
1996     ADD_SIGMA (a, (index) + 2 * 0) \
1997     ADD_SIGMA (b, (index) + 2 * 1) \
1998     ADD_SIGMA (c, (index) + 2 * 2) \
1999     ADD_SIGMA (d, (index) + 2 * 3) \
2000     REP4 (ADD32M, 0, 1, a, b, c, d) \
2001     REP4 (XOR32M, 3, 0, a, b, c, d) \
2002     REP4 (RTR32M, 3, x, a, b, c, d) \
2003     REP4 (ADD32M, 2, 3, a, b, c, d) \
2004     REP4 (XOR32M, 1, 2, a, b, c, d) \
2005     REP4 (RTR32M, 1, y, a, b, c, d) \
2006 
2007   #define G(a, b, c, d) \
2008     G2(a * 2    , a, b, c, d, 16, 12) \
2009     G2(a * 2 + 1, a, b, c, d,  8,  7) \
2010 
2011   #define R2 \
2012     G(0, 1, 2, 3) \
2013     G(4, 5, 6, 7) \
2014 
2015 #endif
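  /* For reference, one un-interleaved G application above expands to the
     standard BLAKE2s mixing function (rotation constants 16, 12, 8, 7):
       a += b + m[sigma[2*i]];      d = rotr(d ^ a, 16);
       c += d;                      b = rotr(b ^ c, 12);
       a += b + m[sigma[2*i + 1]];  d = rotr(d ^ a, 8);
       c += d;                      b = rotr(b ^ c, 7);
     The interleave-2/-4 variants only reorder these operations across
     several (a, b, c, d) columns to expose instruction-level parallelism. */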
2016 
2017   #define R(r)  { const Byte *sigma = k_Blake2s_Sigma_4[r];  R2 }
2018 
2019   // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
2020   //   20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
2021   //   30-60% faster for (arm64-arm32) GCC.
2022   //    5-11% faster for (arm64) CLANG-MAC.
2023   // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
2024   // But if there is a vector branch (for x86*), this scalar code will mostly be unused.
2025   // So we prefer smaller code (without unrolling) in that case (x86*).
2026 #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
2027   #define Z7_BLAKE2S_UNROLL
2028 #endif
2029 
2030 #ifdef Z7_BLAKE2S_UNROLL
2031     ROUNDS_LOOP_UNROLLED (R)
2032 #else
2033     ROUNDS_LOOP (R)
2034 #endif
2035 
2036   #undef G
2037   #undef G2
2038   #undef R
2039   #undef R2
2040 
2041   // printf("\n v after: \n");
2042   // PrintStates2((const UInt32 *)v, 1, 16);
2043 #define XOR_s_PAIR_v(i)  s[i] ^= v[i] ^ v[i + 8];
2044 
2045   LOOP_8(XOR_s_PAIR_v)
2046   // printf("\n s after:\n");
2047   // PrintStates2((const UInt32 *)s, 1, 16);
2048 }
2049 
2050 
2051 static
2052 Z7_NO_INLINE
2053 void
2054 Z7_FASTCALL
2055 Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
2056 {
2057   size_t pos = 0;
2058   // PrintStates2(s_items, 8, 16);
2059   do
2060   {
2061     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
2062     Blake2s_Compress(s, data);
2063     data += Z7_BLAKE2S_BLOCK_SIZE;
2064     pos  += Z7_BLAKE2S_BLOCK_SIZE;
2065     pos &= SUPER_BLOCK_MASK;
2066   }
2067   while (data != end);
2068 }
2069 
2070 
2071 #ifdef Z7_BLAKE2S_USE_VECTORS
2072 
2073 static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast   = Blake2sp_Compress2;
2074 static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
2075 static Z7_BLAKE2SP_FUNC_INIT     g_Z7_BLAKE2SP_FUNC_INIT_Init;
2076 static Z7_BLAKE2SP_FUNC_INIT     g_Z7_BLAKE2SP_FUNC_INIT_Final;
2077 static unsigned g_z7_Blake2sp_SupportedFlags;
2078 
2079   #define Z7_BLAKE2SP_Compress_Fast(p)   (p)->u.header.func_Compress_Fast
2080   #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single
2081 #else
2082   #define Z7_BLAKE2SP_Compress_Fast(p)   Blake2sp_Compress2
2083   #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2
2084 #endif // Z7_BLAKE2S_USE_VECTORS
2085 
2086 
2087 #if 1 && defined(MY_CPU_LE)
2088     #define GET_DIGEST(_s, _digest) \
2089       { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
2090 #else
2091     #define GET_DIGEST(_s, _digest) \
2092     { unsigned _i; for (_i = 0; _i < 8; _i++) \
2093         { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
2094     }
2095 #endif
2096 
2097 
2098 /* ---------- BLAKE2s ---------- */
2099 /*
2100 // we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0()
2101 typedef struct
2102 {
2103   Byte  digest_length;
2104   Byte  key_length;
2105   Byte  fanout;               // = 1 : in sequential mode
2106   Byte  depth;                // = 1 : in sequential mode
2107   UInt32 leaf_length;
2108   Byte  node_offset[6];       // 0 for the first, leftmost, leaf, or in sequential mode
2109   Byte  node_depth;           // 0 for the leaves, or in sequential mode
2110   Byte  inner_length;         // [0, 32], 0 in sequential mode
2111   Byte  salt[BLAKE2S_SALTBYTES];
2112   Byte  personal[BLAKE2S_PERSONALBYTES];
2113 } CBlake2sParam;
2114 */
2115 
2116 #define k_Blake2sp_IV_0  \
2117     (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
2118 #define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth)  \
2119     (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))
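/* These constants fold the BLAKE2s parameter block (see CBlake2sParam above)
   into the IV words: word 0 packs digest_length | key_length << 8 |
   fanout << 16 | depth << 24 (here 32, 0, 8, 2), and word 3 packs
   node_depth << 16 and inner_length << 24 (inner_length = 32).
   Word 2 is xored with the low 32 bits of node_offset in
   Blake2sp_Init_Spec below. */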
2120 
2121 Z7_FORCE_INLINE
2122 static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
2123 {
2124   s[0] = k_Blake2sp_IV_0;
2125   s[1] = KIV(1);
2126   s[2] = KIV(2) ^ (UInt32)node_offset;
2127   s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
2128   s[4] = KIV(4);
2129   s[5] = KIV(5);
2130   s[6] = KIV(6);
2131   s[7] = KIV(7);
2132 
2133   STATE_T(s)[0] = 0;
2134   STATE_T(s)[1] = 0;
2135   STATE_F(s)[0] = 0;
2136   STATE_F(s)[1] = 0;
2137 }
2138 
2139 
2140 #ifdef Z7_BLAKE2S_USE_V128_FAST
2141 
2142 static
2143 Z7_NO_INLINE
2144 #ifdef BLAKE2S_ATTRIB_128BIT
2145        BLAKE2S_ATTRIB_128BIT
2146 #endif
2147 void
2148 Z7_FASTCALL
2149 Blake2sp_InitState_V128_Fast(UInt32 *states)
2150 {
2151 #define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
2152   { STORE_128_TO_STRUCT(states +  0 + 4 * (i), (t0)); \
2153     STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
2154   }
2155 #define STORE_128_PAIR_INIT_STATES_1(i, mac) \
2156   { const __m128i t = mac; \
2157     STORE_128_PAIR_INIT_STATES_2(i, t, t) \
2158   }
2159 #define STORE_128_PAIR_INIT_STATES_IV(i) \
2160     STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))
2161 
2162   STORE_128_PAIR_INIT_STATES_1  (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
2163   STORE_128_PAIR_INIT_STATES_IV (1)
2164   {
2165     const __m128i t = GET_128_IV_WAY4(2);
2166     STORE_128_PAIR_INIT_STATES_2 (2,
2167         XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
2168         XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
2169   }
2170   STORE_128_PAIR_INIT_STATES_1  (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2171   STORE_128_PAIR_INIT_STATES_IV (4)
2172   STORE_128_PAIR_INIT_STATES_IV (5)
2173   STORE_128_PAIR_INIT_STATES_IV (6)
2174   STORE_128_PAIR_INIT_STATES_IV (7)
2175   STORE_128_PAIR_INIT_STATES_1  (16, _mm_set_epi32(0, 0, 0, 0))
2176   // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
2177 }
2178 
2179 #endif // Z7_BLAKE2S_USE_V128_FAST
2180 
2181 
2182 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2183 
2184 static
2185 Z7_NO_INLINE
2186 #ifdef BLAKE2S_ATTRIB_AVX2
2187        BLAKE2S_ATTRIB_AVX2
2188 #endif
2189 void
2190 Z7_FASTCALL
2191 Blake2sp_InitState_AVX2_Fast(UInt32 *states)
2192 {
2193 #define STORE_256_INIT_STATES(i, t) \
2194     STORE_256_TO_STRUCT(states + 8 * (i), t);
2195 #define STORE_256_INIT_STATES_IV(i) \
2196     STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))
2197 
2198   STORE_256_INIT_STATES    (0,  _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
2199   STORE_256_INIT_STATES_IV (1)
2200   STORE_256_INIT_STATES    (2, XOR_256( GET_256_IV_WAY8(2),
2201                                 _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
2202   STORE_256_INIT_STATES    (3,  _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2203   STORE_256_INIT_STATES_IV (4)
2204   STORE_256_INIT_STATES_IV (5)
2205   STORE_256_INIT_STATES_IV (6)
2206   STORE_256_INIT_STATES_IV (7)
2207   STORE_256_INIT_STATES    (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
2208   // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
2209 }
2210 
2211 #endif // Z7_BLAKE2S_USE_AVX2_FAST
2212 
2213 
2214 
2215 Z7_NO_INLINE
2216 void Blake2sp_InitState(CBlake2sp *p)
2217 {
2218   size_t i;
2219   // memset(p->states, 0, sizeof(p->states)); // for debug
2220   p->u.header.cycPos = 0;
2221 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2222   if (p->u.header.func_Init)
2223   {
2224     p->u.header.func_Init(p->states);
2225     return;
2226   }
2227 #endif
2228   for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
2229     Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
2230 }
2231 
2232 void Blake2sp_Init(CBlake2sp *p)
2233 {
2234 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2235   p->u.header.func_Compress_Fast =
2236 #ifdef Z7_BLAKE2S_USE_VECTORS
2237     g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2238 #else
2239     NULL;
2240 #endif
2241 
2242   p->u.header.func_Compress_Single =
2243 #ifdef Z7_BLAKE2S_USE_VECTORS
2244     g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2245 #else
2246     NULL;
2247 #endif
2248 
2249   p->u.header.func_Init =
2250 #ifdef Z7_BLAKE2S_USE_VECTORS
2251     g_Z7_BLAKE2SP_FUNC_INIT_Init;
2252 #else
2253     NULL;
2254 #endif
2255 
2256   p->u.header.func_Final =
2257 #ifdef Z7_BLAKE2S_USE_VECTORS
2258     g_Z7_BLAKE2SP_FUNC_INIT_Final;
2259 #else
2260     NULL;
2261 #endif
2262 #endif
2263 
2264   Blake2sp_InitState(p);
2265 }
2266 
2267 
2268 void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
2269 {
2270   size_t pos;
2271   // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
2272   if (size == 0)
2273     return;
2274   pos = p->u.header.cycPos;
2275   // pos <  SUPER_BLOCK_SIZE * 2  : is expected
2276 // pos == SUPER_BLOCK_SIZE * 2  : is not expected, but is also supported
2277   {
2278     const size_t pos2 = pos & SUPER_BLOCK_MASK;
2279     if (pos2)
2280     {
2281       const size_t rem = SUPER_BLOCK_SIZE - pos2;
2282       if (rem > size)
2283       {
2284         p->u.header.cycPos = (unsigned)(pos + size);
2285         // cycPos < SUPER_BLOCK_SIZE * 2
2286         memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2287         /* to simplify the code here we don't try to process the first superblock,
2288            if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
2289         return;
2290       }
2291       // (rem <= size)
2292       memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
2293       pos += rem;
2294       data += rem;
2295       size -= rem;
2296     }
2297   }
2298 
2299   // pos <= SUPER_BLOCK_SIZE * 2
2300   // pos  % SUPER_BLOCK_SIZE == 0
2301   if (pos)
2302   {
2303     /* pos == SUPER_BLOCK_SIZE ||
2304        pos == SUPER_BLOCK_SIZE * 2 */
2305     size_t end = pos;
2306     if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
2307         || (end -= SUPER_BLOCK_SIZE))
2308     {
2309       Z7_BLAKE2SP_Compress_Fast(p)(p->states,
2310           (const Byte *)(const void *)p->buf32,
2311           (const Byte *)(const void *)p->buf32 + end);
2312       if (pos -= end)
2313         memcpy(p->buf32, (const Byte *)(const void *)p->buf32
2314             + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
2315     }
2316   }
2317 
2318   // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
2319   if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2320   {
2321     // pos == 0
2322     const Byte *end;
2323     const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
2324         & ~(size_t)SUPER_BLOCK_MASK;
2325     size -= size2;
2326     // size < SUPER_BLOCK_SIZE * 2
2327     end = data + size2;
2328     Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
2329     data = end;
2330   }
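  /* Worked example (assuming 8 lanes, so SUPER_BLOCK_SIZE = 8 * 64 = 512):
     with pos == 0 and size == 2000, size2 = (2000 - 449) & ~511 = 1536 bytes
     are compressed directly from the caller's buffer above, and the remaining
     464 bytes (always in the range [449, 960]) are buffered below for later
     calls. */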
2331 
2332   if (size != 0)
2333   {
2334     memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2335     pos += size;
2336   }
2337   p->u.header.cycPos = (unsigned)pos;
2338   // cycPos < SUPER_BLOCK_SIZE * 2
2339 }
2340 
2341 
2342 void Blake2sp_Final(CBlake2sp *p, Byte *digest)
2343 {
2344   // UInt32 * const R_states = p->states;
2345   // printf("\nBlake2sp_Final \n");
2346 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2347   if (p->u.header.func_Final)
2348       p->u.header.func_Final(p->states);
2349 #endif
2350   // printf("\n=====\nBlake2sp_Final \n");
2351   // PrintStates(p->states, 32);
2352 
2353   // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
2354   if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
2355   {
2356     unsigned pos;
2357     memset((Byte *)(void *)p->buf32 + p->u.header.cycPos,
2358         0, SUPER_BLOCK_SIZE - p->u.header.cycPos);
2359     STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2360     for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2361     {
2362       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2363       Blake2s_Set_LastBlock(s)
2364       if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
2365       {
2366         UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
2367         if (pos < p->u.header.cycPos)
2368           delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
2369         // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
2370         {
2371           const UInt32 v = STATE_T(s)[0];
2372           STATE_T(s)[1] -= v < delta; //  (v < delta) is same condition here as (v == 0)
2373           STATE_T(s)[0]  = v - delta;
2374         }
2375       }
2376     }
2377     // PrintStates(p->states, 16);
2378     Z7_BLAKE2SP_Compress_Single(p)(p->states,
2379         (Byte *)(void *)p->buf32,
2380         (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2381     // PrintStates(p->states, 16);
2382   }
2383   else
2384   {
2385     // (p->u.header.cycPos > SUPER_BLOCK_SIZE)
2386     unsigned pos;
2387     for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2388     {
2389       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2390       if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
2391         Blake2s_Set_LastBlock(s)
2392     }
2393     if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2394       STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2395 
2396     Z7_BLAKE2SP_Compress_Single(p)(p->states,
2397         (Byte *)(void *)p->buf32,
2398         (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2399 
2400     // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2401       STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2402 
2403     // if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
2404     {
2405       pos = SUPER_BLOCK_SIZE;
2406       for (;;)
2407       {
2408         UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
2409         Blake2s_Set_LastBlock(s)
2410         pos += Z7_BLAKE2S_BLOCK_SIZE;
2411         if (pos >= p->u.header.cycPos)
2412         {
2413           if (pos != p->u.header.cycPos)
2414           {
2415             const UInt32 delta = pos - p->u.header.cycPos;
2416             const UInt32 v = STATE_T(s)[0];
2417             STATE_T(s)[1] -= v < delta;
2418             STATE_T(s)[0]  = v - delta;
2419             memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
2420           }
2421           break;
2422         }
2423       }
2424       Z7_BLAKE2SP_Compress_Single(p)(p->states,
2425           (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
2426           (Byte *)(void *)p->buf32 + pos);
2427     }
2428   }
2429 
2430   {
2431     size_t pos;
2432     for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
2433     {
2434       const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
2435       Byte *dest = (Byte *)(void *)p->buf32 + pos;
2436       GET_DIGEST(s, dest)
2437     }
2438   }
2439   Blake2sp_Init_Spec(p->states, 0, 1);
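  /* Root node (node_depth = 1): the Z7_BLAKE2SP_PARALLEL_DEGREE leaf digests
     gathered into buf32 above form 8 * 32 = 256 bytes = 4 blocks here.
     The loop below compresses the first 3 blocks; the last block is
     compressed after the final-block / last-node flags are set. */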
2440   {
2441     size_t pos;
2442     for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
2443         - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2444     {
2445       Z7_BLAKE2SP_Compress_Single(p)(p->states,
2446           (const Byte *)(const void *)p->buf32 + pos,
2447           (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
2448     }
2449   }
2450   // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
2451   Blake2s_Set_LastBlock(p->states)
2452   STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
2453   {
2454     Z7_BLAKE2SP_Compress_Single(p)(p->states,
2455         (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
2456         (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
2457   }
2458   GET_DIGEST(p->states, digest)
2459   // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
2460 }
2461 
2462 
2463 BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
2464 {
2465   // printf("\n========== setfunction = %d ======== \n",  algo);
2466 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2467   Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
2468   Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
2469   Z7_BLAKE2SP_FUNC_INIT     func_Final = NULL;
2470   Z7_BLAKE2SP_FUNC_INIT     func_Init = NULL;
2471 #else
2472   UNUSED_VAR(p)
2473 #endif
2474 
2475 #ifdef Z7_BLAKE2S_USE_VECTORS
2476 
2477   func = func_Single = Blake2sp_Compress2;
2478 
2479   if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
2480   {
2481     // printf("\n========== setfunction NON-SCALER ======== \n");
2482     if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
2483     {
2484       func        = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2485       func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2486       func_Init   = g_Z7_BLAKE2SP_FUNC_INIT_Init;
2487       func_Final  = g_Z7_BLAKE2SP_FUNC_INIT_Final;
2488     }
2489     else
2490     {
2491       if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
2492         return False;
2493 
2494 #ifdef Z7_BLAKE2S_USE_AVX2
2495 
2496       func_Single =
2497 #if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2498         Blake2sp_Compress2_AVX2_Way2;
2499 #else
2500         Z7_BLAKE2S_Compress2_V128;
2501 #endif
2502 
2503 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2504       if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
2505       {
2506         func = Blake2sp_Compress2_AVX2_Fast;
2507         func_Final = Blake2sp_Final_AVX2_Fast;
2508         func_Init  = Blake2sp_InitState_AVX2_Fast;
2509       }
2510       else
2511 #endif
2512 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2513       if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
2514         func = Blake2sp_Compress2_AVX2_Way2;
2515       else
2516 #endif
2517 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2518       if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
2519       {
2520         func_Single = func = Blake2sp_Compress2_AVX2_Way4;
2521       }
2522       else
2523 #endif
2524 #endif // avx2
2525       {
2526         if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
2527         {
2528           func       = Blake2sp_Compress2_V128_Fast;
2529           func_Final = Blake2sp_Final_V128_Fast;
2530           func_Init  = Blake2sp_InitState_V128_Fast;
2531           func_Single = Z7_BLAKE2S_Compress2_V128;
2532         }
2533         else
2534 #ifdef Z7_BLAKE2S_USE_V128_WAY2
2535         if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
2536           func = func_Single = Blake2sp_Compress2_V128_Way2;
2537         else
2538 #endif
2539         {
2540           if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
2541             return False;
2542           func = func_Single = Blake2sp_Compress2_V128_Way1;
2543         }
2544       }
2545     }
2546   }
2547 #else // !VECTORS
2548   if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
2549     return False;
2550 #endif // !VECTORS
2551 
2552 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2553   p->u.header.func_Compress_Fast = func;
2554   p->u.header.func_Compress_Single = func_Single;
2555   p->u.header.func_Final = func_Final;
2556   p->u.header.func_Init = func_Init;
2557 #endif
2558   // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
2559   return True;
2560 }
2561 
2562 
2563 void z7_Black2sp_Prepare(void)
2564 {
2565 #ifdef Z7_BLAKE2S_USE_VECTORS
2566   unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);
2567 
2568   Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2;
2569   Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
2570   Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2571   Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2572 
2573 #if defined(MY_CPU_X86_OR_AMD64)
2574     #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2575       // optional check
2576       #if 0 || !(defined(__AVX512F__) && defined(__AVX512VL__))
2577       if (CPU_IsSupported_AVX512F_AVX512VL())
2578       #endif
2579     #elif defined(Z7_BLAKE2S_USE_SSE41)
2580       if (CPU_IsSupported_SSE41())
2581     #elif defined(Z7_BLAKE2S_USE_SSSE3)
2582       if (CPU_IsSupported_SSSE3())
2583     #elif !defined(MY_CPU_AMD64)
2584       if (CPU_IsSupported_SSE2())
2585     #endif
2586 #endif
2587   {
2588     #if defined(Z7_BLAKE2S_USE_SSE41)
2589       // printf("\n========== Blake2s SSE41 128-bit\n");
2590     #elif defined(Z7_BLAKE2S_USE_SSSE3)
2591       // printf("\n========== Blake2s SSSE3 128-bit\n");
2592     #else
2593       // printf("\n========== Blake2s SSE2 128-bit\n");
2594     #endif
2595     // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
2596     // printf("\n========== Blake2sp_Compress2_V128_Way2\n");
2597     func_Fast   =
2598     func_Single = Z7_BLAKE2S_Compress2_V128;
2599     flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
2600 #ifdef Z7_BLAKE2S_USE_V128_WAY2
2601     flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
2602 #endif
2603 #ifdef Z7_BLAKE2S_USE_V128_FAST
2604     flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
2605     func_Fast  = Blake2sp_Compress2_V128_Fast;
2606     func_Init  = Blake2sp_InitState_V128_Fast;
2607     func_Final = Blake2sp_Final_V128_Fast;
2608 #endif
2609 
2610 #ifdef Z7_BLAKE2S_USE_AVX2
2611 #if defined(MY_CPU_X86_OR_AMD64)
2612 
2613     #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2614       #if 0
2615         if (CPU_IsSupported_AVX512F_AVX512VL())
2616       #endif
2617     #else
2618         if (CPU_IsSupported_AVX2())
2619     #endif
2620 #endif
2621     {
2622     // #pragma message ("=== Blake2s AVX2")
2623     // printf("\n========== Blake2s AVX2\n");
2624 
2625 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2626       func_Single = Blake2sp_Compress2_AVX2_Way2;
2627       flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
2628 #endif
2629 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2630       flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
2631 #endif
2632 
2633 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2634       flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
2635       func_Fast  = Blake2sp_Compress2_AVX2_Fast;
2636       func_Init  = Blake2sp_InitState_AVX2_Fast;
2637       func_Final = Blake2sp_Final_AVX2_Fast;
2638 #elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
2639       func_Fast  = Blake2sp_Compress2_AVX2_Way4;
2640 #elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2641       func_Fast  = Blake2sp_Compress2_AVX2_Way2;
2642 #endif
2643     } // avx2
2644 #endif // avx2
2645   } // sse*
2646   g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast   = func_Fast;
2647   g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
2648   g_Z7_BLAKE2SP_FUNC_INIT_Init       = func_Init;
2649   g_Z7_BLAKE2SP_FUNC_INIT_Final      = func_Final;
2650   g_z7_Blake2sp_SupportedFlags = flags;
2651   // printf("\nflags=%x\n", flags);
2652 #endif // vectors
2653 }
2654 
2655 /*
2656 #ifdef Z7_BLAKE2S_USE_VECTORS
2657 void align_test2(CBlake2sp *sp);
2658 void align_test2(CBlake2sp *sp)
2659 {
2660   __m128i a = LOAD_128(sp->states);
2661   D_XOR_128(a, LOAD_128(sp->states + 4));
2662   STORE_128(sp->states, a);
2663 }
2664 void align_test2(void);
2665 void align_test2(void)
2666 {
2667   CBlake2sp sp;
2668   Blake2sp_Init(&sp);
2669   Blake2sp_Update(&sp, NULL, 0);
2670 }
2671 #endif
2672 */
2673