1 /* Blake2s.c -- BLAKE2sp Hash
2 2024-05-18 : Igor Pavlov : Public domain
3 2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
4
5 #include "Precomp.h"
6
7 // #include <stdio.h>
8 #include <string.h>
9
10 #include "Blake2.h"
11 #include "RotateDefs.h"
12 #include "Compiler.h"
13 #include "CpuArch.h"
14
15 /*
16    if defined(__AVX512F__) && defined(__AVX512VL__)
17    {
18      we define Z7_BLAKE2S_USE_AVX512_ALWAYS,
19      and the compiler is also free to use avx512 instructions in any code.
20    }
21    else if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
22    { we use avx512 only in the sse* and avx* branches of the code. }
23 */
24 // #define Z7_BLAKE2S_USE_AVX512_ALWAYS // for debug
25
26 #if defined(__SSE2__)
27 #define Z7_BLAKE2S_USE_VECTORS
28 #elif defined(MY_CPU_X86_OR_AMD64)
29 #if defined(_MSC_VER) && _MSC_VER > 1200 \
30 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
31 || defined(__clang__) \
32 || defined(__INTEL_COMPILER)
33 #define Z7_BLAKE2S_USE_VECTORS
34 #endif
35 #endif
36
37 #ifdef Z7_BLAKE2S_USE_VECTORS
38
39 #define Z7_BLAKE2SP_USE_FUNCTIONS
40
41 // define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned for 32 bytes.
42 // #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
43
44 // SSSE3 : for _mm_shuffle_epi8 (pshufb), which improves the performance by 5-15%.
45 #if defined(__SSSE3__)
46 #define Z7_BLAKE2S_USE_SSSE3
47 #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
48 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
49 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
50 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
51 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
52 #define Z7_BLAKE2S_USE_SSSE3
53 #endif
54
55 #ifdef Z7_BLAKE2S_USE_SSSE3
56 /* SSE41 : for _mm_insert_epi32 (pinsrd)
57    it can slightly reduce code size and improve the performance in some cases.
58    it's used only for the last 512-1024 bytes, if FAST versions (2 or 3) of vector algos are used.
59    it can be used for all blocks in other algos (4+).
60 */
61 #if defined(__SSE4_1__)
62 #define Z7_BLAKE2S_USE_SSE41
63 #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
64 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
65 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
66 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
67 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
68 #define Z7_BLAKE2S_USE_SSE41
69 #endif
70 #endif // SSSE3
71
72 #if defined(__GNUC__) || defined(__clang__)
73 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) && !(defined(__AVX512F__) && defined(__AVX512VL__))
74 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("avx512vl,avx512f")))
75 #else
76 #if defined(Z7_BLAKE2S_USE_SSE41)
77 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1")))
78 #elif defined(Z7_BLAKE2S_USE_SSSE3)
79 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3")))
80 #else
81 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2")))
82 #endif
83 #endif
84 #endif
85
86
87 #if defined(__AVX2__)
88 #define Z7_BLAKE2S_USE_AVX2
89 #else
90 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
91 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
92 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
93 #define Z7_BLAKE2S_USE_AVX2
94 #ifdef Z7_BLAKE2S_USE_AVX2
95 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) && !(defined(__AVX512F__) && defined(__AVX512VL__))
96 #define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx512vl,avx512f")))
97 #else
98 #define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2")))
99 #endif
100 #endif
101 #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
102 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
103 #if (Z7_MSC_VER_ORIGINAL == 1900)
104 #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
105 #endif
106 #define Z7_BLAKE2S_USE_AVX2
107 #endif
108 #endif
109
110 #ifdef Z7_BLAKE2S_USE_SSE41
111 #include <smmintrin.h> // SSE4.1
112 #elif defined(Z7_BLAKE2S_USE_SSSE3)
113 #include <tmmintrin.h> // SSSE3
114 #else
115 #include <emmintrin.h> // SSE2
116 #endif
117
118 #ifdef Z7_BLAKE2S_USE_AVX2
119 #include <immintrin.h>
120 #if defined(__clang__)
121 #include <avxintrin.h>
122 #include <avx2intrin.h>
123 #endif
124 #endif // avx2
125
126
127 #if defined(__AVX512F__) && defined(__AVX512VL__)
128 // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
129 #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
130 #define Z7_BLAKE2S_USE_AVX512_ALWAYS
131 #endif
132 // #pragma message ("=== Blake2s AVX512")
133 #endif
134
135
136 #define Z7_BLAKE2S_USE_V128_FAST
137 // for speed optimization for small messages:
138 // #define Z7_BLAKE2S_USE_V128_WAY2
139
140 #ifdef Z7_BLAKE2S_USE_AVX2
141
142 // for debug:
143 // gather is slow
144 // #define Z7_BLAKE2S_USE_GATHER
145
146 #define Z7_BLAKE2S_USE_AVX2_FAST
147 // for speed optimization for small messages:
148 // #define Z7_BLAKE2S_USE_AVX2_WAY2
149 // #define Z7_BLAKE2S_USE_AVX2_WAY4
150 #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
151 defined(Z7_BLAKE2S_USE_AVX2_WAY4)
152 #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
153 #endif
154 #endif
155
156 #define Z7_BLAKE2SP_ALGO_DEFAULT 0
157 #define Z7_BLAKE2SP_ALGO_SCALAR 1
158 #ifdef Z7_BLAKE2S_USE_V128_FAST
159 #define Z7_BLAKE2SP_ALGO_V128_FAST 2
160 #endif
161 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
162 #define Z7_BLAKE2SP_ALGO_V256_FAST 3
163 #endif
164 #define Z7_BLAKE2SP_ALGO_V128_WAY1 4
165 #ifdef Z7_BLAKE2S_USE_V128_WAY2
166 #define Z7_BLAKE2SP_ALGO_V128_WAY2 5
167 #endif
168 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
169 #define Z7_BLAKE2SP_ALGO_V256_WAY2 6
170 #endif
171 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
172 #define Z7_BLAKE2SP_ALGO_V256_WAY4 7
173 #endif
174
175 #endif // Z7_BLAKE2S_USE_VECTORS
176
177
178
179
180 #define BLAKE2S_FINAL_FLAG (~(UInt32)0)
181 #define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS
182 #define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
183 #define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1)
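/*
  Note (a summary; Z7_BLAKE2SP_PARALLEL_DEGREE is declared in Blake2.h and is 8 for BLAKE2sp):
  BLAKE2sp runs 8 independent BLAKE2s lanes, so one "super block" of input is
  8 * 64 = 512 bytes, and the byte position (pos & SUPER_BLOCK_MASK) inside the
  super block also serves as the byte offset of the corresponding lane state
  inside the states array.
*/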
184
185 #define V_INDEX_0_0 0
186 #define V_INDEX_1_0 1
187 #define V_INDEX_2_0 2
188 #define V_INDEX_3_0 3
189 #define V_INDEX_0_1 4
190 #define V_INDEX_1_1 5
191 #define V_INDEX_2_1 6
192 #define V_INDEX_3_1 7
193 #define V_INDEX_0_2 8
194 #define V_INDEX_1_2 9
195 #define V_INDEX_2_2 10
196 #define V_INDEX_3_2 11
197 #define V_INDEX_0_3 12
198 #define V_INDEX_1_3 13
199 #define V_INDEX_2_3 14
200 #define V_INDEX_3_3 15
201 #define V_INDEX_4_0 0
202 #define V_INDEX_5_0 1
203 #define V_INDEX_6_0 2
204 #define V_INDEX_7_0 3
205 #define V_INDEX_7_1 4
206 #define V_INDEX_4_1 5
207 #define V_INDEX_5_1 6
208 #define V_INDEX_6_1 7
209 #define V_INDEX_6_2 8
210 #define V_INDEX_7_2 9
211 #define V_INDEX_4_2 10
212 #define V_INDEX_5_2 11
213 #define V_INDEX_5_3 12
214 #define V_INDEX_6_3 13
215 #define V_INDEX_7_3 14
216 #define V_INDEX_4_3 15
217
218 #define V(row, col) v[V_INDEX_ ## row ## _ ## col]
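/*
  The V_INDEX tables above encode the standard BLAKE2s round structure:
  V(g, i) is the i-th operand (a, b, c, d) of G call number g within a round.
  Calls 0-3 (column step) use state words { g, g+4, g+8, g+12 };
  calls 4-7 (diagonal step) use the diagonals, e.g. G#4 works on
  v[0], v[5], v[10], v[15].
*/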
219
220 #define k_Blake2s_IV_0 0x6A09E667UL
221 #define k_Blake2s_IV_1 0xBB67AE85UL
222 #define k_Blake2s_IV_2 0x3C6EF372UL
223 #define k_Blake2s_IV_3 0xA54FF53AUL
224 #define k_Blake2s_IV_4 0x510E527FUL
225 #define k_Blake2s_IV_5 0x9B05688CUL
226 #define k_Blake2s_IV_6 0x1F83D9ABUL
227 #define k_Blake2s_IV_7 0x5BE0CD19UL
228
229 #define KIV(n) (k_Blake2s_IV_## n)
230
231 #ifdef Z7_BLAKE2S_USE_VECTORS
232 MY_ALIGN(16)
233 static const UInt32 k_Blake2s_IV[8] =
234 {
235 KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
236 };
237 #endif
238
239 #define STATE_T(s) ((s) + 8)
240 #define STATE_F(s) ((s) + 10)
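/*
  Per-lane state layout (UInt32 words), as used by STATE_T / STATE_F:
    s[0..7]   : h - chaining value
    s[8..9]   : t - 64-bit byte counter (low, high)
    s[10..11] : f - finalization flags
*/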
241
242 #ifdef Z7_BLAKE2S_USE_VECTORS
243
244 #define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p))
245 #define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
246 #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
247 // here we use unaligned loads and stores
248 // use this branch if CBlake2sp can be unaligned for 16 bytes
249 #define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
250 #define LOAD_128_FROM_STRUCT(p) LOADU_128(p)
251 #define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r)
252 #else
253 // here we use aligned loads and stores
254 // use this branch if CBlake2sp is aligned for 16 bytes
255 #define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r)
256 #define LOAD_128_FROM_STRUCT(p) LOAD_128(p)
257 #define STORE_128_TO_STRUCT(p, r) STORE_128(p, r)
258 #endif
259
260 #endif // Z7_BLAKE2S_USE_VECTORS
261
262
263 #if 0
264 static void PrintState(const UInt32 *s, unsigned num)
265 {
266 unsigned i;
267 printf("\n");
268 for (i = 0; i < num; i++)
269 printf(" %08x", (unsigned)s[i]);
270 }
271 static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
272 {
273 unsigned i;
274 for (i = 0; i < y; i++)
275 PrintState(s + i * x, x);
276 printf("\n");
277 }
278 #endif
279
280
281 #define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
282
283 #define BLAKE2S_NUM_ROUNDS 10
284
285 #if defined(Z7_BLAKE2S_USE_VECTORS)
286 #define ROUNDS_LOOP(mac) \
287 { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
288 #endif
289 /*
290 #define ROUNDS_LOOP_2(mac) \
291 { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
292 */
293 #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
294 #define ROUNDS_LOOP_UNROLLED(m) \
295 { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
296 #endif
297
298 #define SIGMA_TABLE(M) \
299 M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \
300 M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \
301 M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \
302 M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \
303 M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \
304 M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \
305 M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \
306 M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \
307 M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \
308 M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 )
309
310 #define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
311 { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
312 #define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
313 SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
314
315 // MY_ALIGN(32)
316 MY_ALIGN(16)
317 static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
318 { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
319
320 #define GET_SIGMA_PTR(p, index) \
321 ((const void *)((const Byte *)(const void *)(p) + (index)))
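/*
  k_Blake2s_Sigma_4 above stores the standard BLAKE2s sigma permutation with
  every index pre-multiplied by 4 (the byte size of a message word), so each
  entry is already the byte offset of the selected 32-bit word, and
  GET_SIGMA_PTR() can add it directly without an extra shift.
*/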
322
323 #define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
324 ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
325
326
327 #ifdef Z7_BLAKE2S_USE_VECTORS
328
329
330 #if 0
331 // loading the constants from memory
332 // is faster for some compilers.
333 #define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n)
334 MY_ALIGN(64)
335 static const UInt32 k_Blake2s_IV_WAY4[]=
336 {
337 KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
338 };
339 #define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
340 #else
341 // use constant generation:
342 #define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i))
343 #endif
344
345
346 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
347 #define GET_CONST_128_FROM_ARRAY32(k) \
348 _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
349 #endif
350
351
352 #if 0
353 #define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
354 #define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
355 #define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
356 #define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
357 #define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
358 #else
359 #if defined(Z7_BLAKE2S_USE_SSSE3) && \
360 !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
361 MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
362 MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
363 #define k_r8 LOAD_128(k_r8_arr)
364 #define k_r16 LOAD_128(k_r16_arr)
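/*
  k_r8_arr / k_r16_arr are pshufb control masks: within every 4-byte group,
  destination byte i takes source byte (i + 1) & 3 or (i + 2) & 3, which is a
  rotate right by 8 or 16 bits of each little-endian 32-bit lane.
*/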
365 #endif
366 MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
367 #define k_inc LOAD_128(k_inc_arr)
368 #define k_iv0_128 LOAD_128(k_Blake2s_IV + 0)
369 #define k_iv4_128 LOAD_128(k_Blake2s_IV + 4)
370 #endif
371
372
373 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
374
375 #ifdef Z7_BLAKE2S_USE_AVX2
376 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
377 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
378 #else
379 #define MY_mm256_set_m128i _mm256_set_m128i
380 #endif
381
382 #define SET_FROM_128(a) MY_mm256_set_m128i(a, a)
383
384 #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
385 MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
386 {
387 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
388 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
389 };
390 MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
391 {
392 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
393 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
394 };
395 #define k_r8_256 LOAD_256(k_r8_arr_256)
396 #define k_r16_256 LOAD_256(k_r16_arr_256)
397 #endif
398
399 // #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
400 // #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
401 // #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
402 // #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
403 #define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
404 #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
405 #endif
406
407
408 /*
409 IPC(TP) ports:
410 1 p__5 : skl- : SSE : shufps : _mm_shuffle_ps
411 2 p_15 : icl+
412 1 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps
413 3 p015 : skl+
414
415 3 p015 : SSE2 : pxor : _mm_xor_si128
416 2 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32
417 2 p0_5: mrm-wsm :
418 3 p015 : skl+
419
420 2 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
421 2 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32
422 2 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16
423 2 p_15 : : SSE2 : psrldq :
424 2 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8
425 2 p_15 : : SSE4 : pblendw : _mm_blend_epi16
426 1 p__5 : hsw-skl : *
427
428 1 p0 : SSE2 : pslld (i8) : _mm_slli_si128
429 2 p01 : skl+ :
430
431 2 p_15 : ivb- : SSE3 : palignr
432 1 p__5 : hsw+
433
434 2 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
435 1 p__5 + p23 : hsw-skl
436 1 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
437 0.5 2*p5 : hsw-skl
438
439 2 p23 : SSE2 : movd (m32)
440 3 p23A : adl :
441 1 p5: : SSE2 : movd (r32)
442 */
443
444 #if 0 && defined(__XOP__)
445 // we must debug and test the __XOP__ branch
446 #include <x86intrin.h>
447 #include <ammintrin.h>
448 #define LOAD_ROTATE_CONSTS
449 #define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c))
450 #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
451 #elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
452 #define LOAD_ROTATE_CONSTS
453 #define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c)
454 #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
455 #else
456
457 // MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
458 // But "orps" has low throughput: TP=1 for bdw-nhm.
459 // So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
460 // But "orps" is fast for modern cpus (skl+).
461 // So we default to the "or" version:
462 #if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
463   // minor optimization for some old cpus, if "orps" is slow.
464 #define MM128_EPI32_OR_or_ADD _mm_add_epi32
465 #else
466 #define MM128_EPI32_OR_or_ADD _mm_or_si128
467 #endif
468
469 #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
470 MM128_EPI32_OR_or_ADD( \
471 _mm_srli_epi32((r), (c)), \
472 _mm_slli_epi32((r), 32-(c))))
473 #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
474 #define LOAD_ROTATE_CONSTS \
475 const __m128i r8 = k_r8; \
476 const __m128i r16 = k_r16;
477 #define MM_ROR_EPI32(r, c) ( \
478 ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
479 : (16==(c)) ? _mm_shuffle_epi8(r,r16) \
480 : MM_ROR_EPI32_VIA_SHIFT(r, c))
481 #else
482 #define LOAD_ROTATE_CONSTS
483 #define MM_ROR_EPI32(r, c) ( \
484 (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
485 : MM_ROR_EPI32_VIA_SHIFT(r, c))
486 #endif
487 #endif
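/*
  Whichever branch is selected, per 32-bit lane MM_ROR_EPI32(r, c) computes the
  plain rotate right:  x = (x >> c) | (x << (32 - c)).
  The SSE2-only fallback handles c == 16 with pshuflw/pshufhw (mask 0xb1 swaps
  the two 16-bit halves of every 32-bit element); other counts go via shifts.
*/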
488
489 /*
490 we have 3 main ways to load 4 32-bit integers to __m128i:
491 1) SSE2: _mm_set_epi32()
492 2) SSE2: _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
493 3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
494 a good compiler generates these instructions for _mm_set_epi32():
495 {
496 movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
497 }
498 a good new compiler generates one instruction
499 {
500 for _mm_insert_epi32() : { pinsrd xmm, [m32], i }
501 for _mm_cvtsi32_si128() : { movd xmm, [m32] }
502 }
503 but vc2010 generates a slow pair of instructions:
504 {
505 for _mm_insert_epi32() : { mov r32, [m32]; pinsrd xmm, r32, i }
506 for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 }
507 }
508 _mm_insert_epi32() (pinsrd) code reduces xmm register pressure
509 in comparison with _mm_set_epi32() (movd + vpunpckld) code.
510 Note that the variant with "movd xmm, r32" can be slower,
511 but register pressure can be more important.
512 So we can force "pinsrd" always.
513 */
514 // #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
515 #ifdef Z7_BLAKE2S_USE_SSE41
516 /* _mm_set_epi32() can be more effective for GCC and CLANG;
517    _mm_insert_epi32() is more effective for MSVC */
518 #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
519 #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
520 #endif
521 #endif // USE_SSE41
522 // #endif
523
524 #ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
525 // for SSE4.1
526 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
527 _mm_insert_epi32( \
528 _mm_insert_epi32( \
529 _mm_insert_epi32( \
530 _mm_cvtsi32_si128( \
531 *(const Int32 *)p0), \
532 *(const Int32 *)p1, 1), \
533 *(const Int32 *)p2, 2), \
534 *(const Int32 *)p3, 3)
535 #elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
536   /* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
537      Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
538      But _mm_set_epi32() is more effective for GCC and CLANG.
539      So we use _mm_unpacklo_epi32 for MSVC only. */
540 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
541 _mm_unpacklo_epi64( \
542 _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
543 _mm_cvtsi32_si128(*(const Int32 *)p1)), \
544 _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
545 _mm_cvtsi32_si128(*(const Int32 *)p3)))
546 #else
547 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
548 _mm_set_epi32( \
549 *(const Int32 *)p3, \
550 *(const Int32 *)p2, \
551 *(const Int32 *)p1, \
552 *(const Int32 *)p0)
553 #endif
554
555 #define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
556 MM_LOAD_EPI32_FROM_4_POINTERS( \
557 GET_SIGMA_PTR(input, i0), \
558 GET_SIGMA_PTR(input, i1), \
559 GET_SIGMA_PTR(input, i2), \
560 GET_SIGMA_PTR(input, i3))
561
562 #define SET_ROW_FROM_SIGMA(input, sigma_index) \
563 SET_ROW_FROM_SIGMA_BASE(input, \
564 sigma[(sigma_index) ], \
565 sigma[(sigma_index) + 2 * 1], \
566 sigma[(sigma_index) + 2 * 2], \
567 sigma[(sigma_index) + 2 * 3]) \
568
569
570 #define ADD_128(a, b) _mm_add_epi32(a, b)
571 #define XOR_128(a, b) _mm_xor_si128(a, b)
572
573 #define D_ADD_128(dest, src) dest = ADD_128(dest, src)
574 #define D_XOR_128(dest, src) dest = XOR_128(dest, src)
575 #define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift)
576 #define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src)
577
578
579 #define AXR(a, b, d, shift) \
580 D_ADD_128(a, b); \
581 D_XOR_128(d, a); \
582 D_ROR_128(d, shift);
583
584 #define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
585 a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
586 AXR(a, b, d, shift1) \
587 AXR(c, d, b, shift2)
588
589 #define ROTATE_WORDS_TO_RIGHT(a, n) \
590 a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
591
592 #define AXR4(a, b, c, d, input, sigma_index) \
593 AXR2(a, b, c, d, input, sigma_index, 16, 12) \
594 AXR2(a, b, c, d, input, sigma_index + 1, 8, 7) \
595
596 #define RR2(a, b, c, d, input) \
597 { \
598 AXR4(a, b, c, d, input, 0) \
599 ROTATE_WORDS_TO_RIGHT(b, 1) \
600 ROTATE_WORDS_TO_RIGHT(c, 2) \
601 ROTATE_WORDS_TO_RIGHT(d, 3) \
602 AXR4(a, b, c, d, input, 8) \
603 ROTATE_WORDS_TO_RIGHT(b, 3) \
604 ROTATE_WORDS_TO_RIGHT(c, 2) \
605 ROTATE_WORDS_TO_RIGHT(d, 1) \
606 }
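/*
  For reference, a scalar sketch of what one AXR4 computes per column
  (this is the standard BLAKE2s G function; the sketch itself is not used
  by the code; x and y are the two message words selected by sigma):

    G(a, b, c, d, x, y):
      a += b + x;  d = rotr32(d ^ a, 16);
      c += d;      b = rotr32(b ^ c, 12);
      a += b + y;  d = rotr32(d ^ a,  8);
      c += d;      b = rotr32(b ^ c,  7);

  In RR2, each __m128i row holds 4 words of one state, so each AXR4 performs
  the four column G calls of a round in parallel, and the
  ROTATE_WORDS_TO_RIGHT shuffles diagonalize the rows so that the second AXR4
  performs the four diagonal G calls.
*/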
607
608
609 /*
610 Way1:
611 per 64-byte block:
612 10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
613 10 rounds * 4 iters * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
614 additional operations per 7_op_iter :
615 4 movzx byte mem
616 1 movd mem
617 3 pinsrd mem
618 1.5 pshufd
619 */
620
621 static
622 #if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
623 defined(Z7_BLAKE2S_USE_V256_WAY2))
624 Z7_NO_INLINE
625 #else
626 Z7_FORCE_INLINE
627 #endif
628 #ifdef BLAKE2S_ATTRIB_128BIT
629 BLAKE2S_ATTRIB_128BIT
630 #endif
631 void
632 Z7_FASTCALL
633 Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
634 {
635 __m128i a, b, c, d;
636 __m128i f0, f1;
637
638 LOAD_ROTATE_CONSTS
639 d = LOAD_128_FROM_STRUCT(STATE_T(s));
640 c = k_iv0_128;
641 a = f0 = LOAD_128_FROM_STRUCT(s);
642 b = f1 = LOAD_128_FROM_STRUCT(s + 4);
643 D_ADD_EPI64_128(d, k_inc);
644 STORE_128_TO_STRUCT (STATE_T(s), d);
645 D_XOR_128(d, k_iv4_128);
646
647 #define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
648 RR2(a, b, c, d, input) }
649
650 ROUNDS_LOOP(RR)
651 #undef RR
652
653 STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c)));
654 STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
655 }
656
657
658 static
659 Z7_NO_INLINE
660 #ifdef BLAKE2S_ATTRIB_128BIT
661 BLAKE2S_ATTRIB_128BIT
662 #endif
663 void
664 Z7_FASTCALL
665 Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
666 {
667 size_t pos = 0;
668 do
669 {
670 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
671 Blake2s_Compress_V128_Way1(s, data);
672 data += Z7_BLAKE2S_BLOCK_SIZE;
673 pos += Z7_BLAKE2S_BLOCK_SIZE;
674 pos &= SUPER_BLOCK_MASK;
675 }
676 while (data != end);
677 }
678
679
680 #if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
681 defined(Z7_BLAKE2S_USE_AVX2_WAY2)
682 #if 1
683 #define Z7_BLAKE2S_CompressSingleBlock(s, data) \
684 Blake2sp_Compress2_V128_Way1(s, data, \
685 (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
686 #else
687 #define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1
688 #endif
689 #endif
690
691
692 #if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
693 defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
694 !defined(Z7_BLAKE2S_USE_GATHER)
695 #define AXR2_LOAD_INDEXES(sigma_index) \
696 const unsigned i0 = sigma[(sigma_index)]; \
697 const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
698 const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
699 const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
700
701 #define SET_ROW_FROM_SIGMA_W(input) \
702 SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
703 #endif
704
705
706 #ifdef Z7_BLAKE2S_USE_V128_WAY2
707
708 #if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
709 /* we use SET_ROW_FROM_SIGMA_BASE, which uses
710      (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined, or
711      (SSE2) _mm_set_epi32() otherwise.
712    MSVC can be faster for this branch:
713 */
714 #define AXR2_W(sigma_index, shift1, shift2) \
715 { \
716 AXR2_LOAD_INDEXES(sigma_index) \
717 a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
718 a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
719 AXR(a0, b0, d0, shift1) \
720 AXR(a1, b1, d1, shift1) \
721 AXR(c0, d0, b0, shift2) \
722 AXR(c1, d1, b1, shift2) \
723 }
724 #else
725 /* we use interleaved _mm_insert_epi32():
726 GCC can be faster for this branch:
727 */
728 #define AXR2_W_PRE_INSERT(sigma_index, i) \
729 { const unsigned ii = sigma[(sigma_index) + i * 2]; \
730 t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
731 t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
732 }
733 #define AXR2_W(sigma_index, shift1, shift2) \
734 { __m128i t0, t1; \
735 { const unsigned ii = sigma[sigma_index]; \
736 t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
737 t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
738 } \
739 AXR2_W_PRE_INSERT(sigma_index, 1) \
740 AXR2_W_PRE_INSERT(sigma_index, 2) \
741 AXR2_W_PRE_INSERT(sigma_index, 3) \
742 a0 = _mm_add_epi32(a0, t0); \
743 a1 = _mm_add_epi32(a1, t1); \
744 AXR(a0, b0, d0, shift1) \
745 AXR(a1, b1, d1, shift1) \
746 AXR(c0, d0, b0, shift2) \
747 AXR(c1, d1, b1, shift2) \
748 }
749 #endif
750
751
752 #define AXR4_W(sigma_index) \
753 AXR2_W(sigma_index, 16, 12) \
754 AXR2_W(sigma_index + 1, 8, 7) \
755
756 #define WW(r) \
757 { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
758 AXR4_W(0) \
759 ROTATE_WORDS_TO_RIGHT(b0, 1) \
760 ROTATE_WORDS_TO_RIGHT(b1, 1) \
761 ROTATE_WORDS_TO_RIGHT(c0, 2) \
762 ROTATE_WORDS_TO_RIGHT(c1, 2) \
763 ROTATE_WORDS_TO_RIGHT(d0, 3) \
764 ROTATE_WORDS_TO_RIGHT(d1, 3) \
765 AXR4_W(8) \
766 ROTATE_WORDS_TO_RIGHT(b0, 3) \
767 ROTATE_WORDS_TO_RIGHT(b1, 3) \
768 ROTATE_WORDS_TO_RIGHT(c0, 2) \
769 ROTATE_WORDS_TO_RIGHT(c1, 2) \
770 ROTATE_WORDS_TO_RIGHT(d0, 1) \
771 ROTATE_WORDS_TO_RIGHT(d1, 1) \
772 }
773
774
775 static
776 Z7_NO_INLINE
777 #ifdef BLAKE2S_ATTRIB_128BIT
778 BLAKE2S_ATTRIB_128BIT
779 #endif
780 void
781 Z7_FASTCALL
782 Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
783 {
784 size_t pos = 0;
785 end -= Z7_BLAKE2S_BLOCK_SIZE;
786
787 if (data != end)
788 {
789 LOAD_ROTATE_CONSTS
790 do
791 {
792 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
793 __m128i a0, b0, c0, d0;
794 __m128i a1, b1, c1, d1;
795 {
796 const __m128i inc = k_inc;
797 const __m128i temp = k_iv4_128;
798 d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
799 d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
800 D_ADD_EPI64_128(d0, inc);
801 D_ADD_EPI64_128(d1, inc);
802 STORE_128_TO_STRUCT (STATE_T(s ), d0);
803 STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
804 D_XOR_128(d0, temp);
805 D_XOR_128(d1, temp);
806 }
807 c1 = c0 = k_iv0_128;
808 a0 = LOAD_128_FROM_STRUCT(s);
809 b0 = LOAD_128_FROM_STRUCT(s + 4);
810 a1 = LOAD_128_FROM_STRUCT(s + NSW);
811 b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
812
813 ROUNDS_LOOP (WW)
814
815 #undef WW
816
817 D_XOR_128(a0, c0);
818 D_XOR_128(b0, d0);
819 D_XOR_128(a1, c1);
820 D_XOR_128(b1, d1);
821
822 D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
823 D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
824 D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
825 D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
826
827 STORE_128_TO_STRUCT(s, a0);
828 STORE_128_TO_STRUCT(s + 4, b0);
829 STORE_128_TO_STRUCT(s + NSW, a1);
830 STORE_128_TO_STRUCT(s + NSW + 4, b1);
831
832 data += Z7_BLAKE2S_BLOCK_SIZE * 2;
833 pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
834 pos &= SUPER_BLOCK_MASK;
835 }
836 while (data < end);
837 if (data != end)
838 return;
839 }
840 {
841 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
842 Z7_BLAKE2S_CompressSingleBlock(s, data);
843 }
844 }
845 #endif // Z7_BLAKE2S_USE_V128_WAY2
846
847
848 #ifdef Z7_BLAKE2S_USE_V128_WAY2
849 #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2
850 #else
851 #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1
852 #endif
853
854
855
856 #ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
857 #define ROT_128_8(x) MM_ROR_EPI32(x, 8)
858 #define ROT_128_16(x) MM_ROR_EPI32(x, 16)
859 #define ROT_128_7(x) MM_ROR_EPI32(x, 7)
860 #define ROT_128_12(x) MM_ROR_EPI32(x, 12)
861 #else
862 #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
863 #define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8
864 #define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16
865 #else
866 #define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8)
867 #define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16)
868 #endif
869 #define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7)
870 #define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12)
871 #endif
872
873
874 #if 1
875 // this branch can provide similar speed on x86* in most cases,
876 // because [base + index*4] provides the same speed as [base + index].
877 // but some compilers can generate different code for this branch, which can be faster sometimes.
878 // this branch uses an additional table of 10*16=160 bytes.
879 #define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
880 SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
881 MY_ALIGN(16)
882 static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
883 { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
884 #define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r];
885 #define GET_SIGMA_VAL_128(n) (sigma[n])
886 #else
887 #define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
888 #define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n])
889 #endif
890
891
892 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
893 #if 1
894 #define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
895 SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
896 MY_ALIGN(64)
897 static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
898 { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
899 #define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
900 #define GET_SIGMA_VAL_256(n) (sigma[n])
901 #else
902 #define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
903 #define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n])
904 #endif
905 #endif // Z7_BLAKE2S_USE_AVX2_FAST
906
907
908 #define D_ROT_128_7(dest) dest = ROT_128_7(dest)
909 #define D_ROT_128_8(dest) dest = ROT_128_8(dest)
910 #define D_ROT_128_12(dest) dest = ROT_128_12(dest)
911 #define D_ROT_128_16(dest) dest = ROT_128_16(dest)
912
913 #define OP_L(a, i) D_ADD_128 (V(a, 0), \
914 LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
915
916 #define OP_0(a) OP_L(a, 0)
917 #define OP_7(a) OP_L(a, 1)
918
919 #define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1));
920 #define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0));
921 #define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3));
922 #define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2));
923
924 #define OP_3(a) D_ROT_128_16 (V(a, 3));
925 #define OP_6(a) D_ROT_128_12 (V(a, 1));
926 #define OP_8(a) D_ROT_128_8 (V(a, 3));
927 #define OP_9(a) D_ROT_128_7 (V(a, 1));
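/*
  Sketch of the OP_* decomposition (per 32-bit lane, i.e. per parallel hash
  instance): together the ops form one BLAKE2s G call number g, where
  a = V(g,0), b = V(g,1), c = V(g,2), d = V(g,3), and w[] holds the transposed
  message words:
    OP_0 : a += w[sigma[2g]]     OP_7 : a += w[sigma[2g+1]]
    OP_1 : a += b                OP_2 : d ^= a
    OP_3 : d = ror16(d)          OP_6 : b = ror12(b)
    OP_4 : c += d                OP_5 : b ^= c
    OP_8 : d = ror8(d)           OP_9 : b = ror7(b)
*/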
928
929
930 // for 32-bit x86 : interleave mode is slower because of register pressure.
931
932 #if 0 || 1 && (defined(MY_CPU_X86) \
933 || defined(__GNUC__) && !defined(__clang__))
934 // non-interleaved version:
935 // is fast for x86 32-bit.
936 // is fast for GCC x86-64.
937
938 #define V4G(a) \
939 OP_0 (a) \
940 OP_1 (a) \
941 OP_2 (a) \
942 OP_3 (a) \
943 OP_4 (a) \
944 OP_5 (a) \
945 OP_6 (a) \
946 OP_7 (a) \
947 OP_1 (a) \
948 OP_2 (a) \
949 OP_8 (a) \
950 OP_4 (a) \
951 OP_5 (a) \
952 OP_9 (a) \
953
954 #define V4R \
955 { \
956 V4G (0) \
957 V4G (1) \
958 V4G (2) \
959 V4G (3) \
960 V4G (4) \
961 V4G (5) \
962 V4G (6) \
963 V4G (7) \
964 }
965
966 #elif 0 || 1 && defined(MY_CPU_X86)
967
968 #define OP_INTER_2(op, a,b) \
969 op (a) \
970 op (b) \
971
972 #define V4G(a,b) \
973 OP_INTER_2 (OP_0, a,b) \
974 OP_INTER_2 (OP_1, a,b) \
975 OP_INTER_2 (OP_2, a,b) \
976 OP_INTER_2 (OP_3, a,b) \
977 OP_INTER_2 (OP_4, a,b) \
978 OP_INTER_2 (OP_5, a,b) \
979 OP_INTER_2 (OP_6, a,b) \
980 OP_INTER_2 (OP_7, a,b) \
981 OP_INTER_2 (OP_1, a,b) \
982 OP_INTER_2 (OP_2, a,b) \
983 OP_INTER_2 (OP_8, a,b) \
984 OP_INTER_2 (OP_4, a,b) \
985 OP_INTER_2 (OP_5, a,b) \
986 OP_INTER_2 (OP_9, a,b) \
987
988 #define V4R \
989 { \
990 V4G (0, 1) \
991 V4G (2, 3) \
992 V4G (4, 5) \
993 V4G (6, 7) \
994 }
995
996 #else
997 // interleave-4 version is fast for x64 (MSVC/CLANG)
998
999 #define OP_INTER_4(op, a,b,c,d) \
1000 op (a) \
1001 op (b) \
1002 op (c) \
1003 op (d) \
1004
1005 #define V4G(a,b,c,d) \
1006 OP_INTER_4 (OP_0, a,b,c,d) \
1007 OP_INTER_4 (OP_1, a,b,c,d) \
1008 OP_INTER_4 (OP_2, a,b,c,d) \
1009 OP_INTER_4 (OP_3, a,b,c,d) \
1010 OP_INTER_4 (OP_4, a,b,c,d) \
1011 OP_INTER_4 (OP_5, a,b,c,d) \
1012 OP_INTER_4 (OP_6, a,b,c,d) \
1013 OP_INTER_4 (OP_7, a,b,c,d) \
1014 OP_INTER_4 (OP_1, a,b,c,d) \
1015 OP_INTER_4 (OP_2, a,b,c,d) \
1016 OP_INTER_4 (OP_8, a,b,c,d) \
1017 OP_INTER_4 (OP_4, a,b,c,d) \
1018 OP_INTER_4 (OP_5, a,b,c,d) \
1019 OP_INTER_4 (OP_9, a,b,c,d) \
1020
1021 #define V4R \
1022 { \
1023 V4G (0, 1, 2, 3) \
1024 V4G (4, 5, 6, 7) \
1025 }
1026
1027 #endif
1028
1029 #define V4_ROUND(r) { GET_SIGMA_PTR_128(r); V4R }
1030
1031
1032 #define V4_LOAD_MSG_1(w, m, i) \
1033 { \
1034 __m128i m0, m1, m2, m3; \
1035 __m128i t0, t1, t2, t3; \
1036 m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
1037 m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
1038 m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
1039 m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
1040 t0 = _mm_unpacklo_epi32(m0, m1); \
1041 t1 = _mm_unpackhi_epi32(m0, m1); \
1042 t2 = _mm_unpacklo_epi32(m2, m3); \
1043 t3 = _mm_unpackhi_epi32(m2, m3); \
1044 w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
1045 w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
1046 w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
1047 w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
1048 }
1049
1050 #define V4_LOAD_MSG(w, m) \
1051 { \
1052 V4_LOAD_MSG_1 (w, m, 0) \
1053 V4_LOAD_MSG_1 (w, m, 1) \
1054 V4_LOAD_MSG_1 (w, m, 2) \
1055 V4_LOAD_MSG_1 (w, m, 3) \
1056 }
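/*
  After V4_LOAD_MSG, w[j] (j = 0..15) holds message word j of the four
  consecutive 64-byte blocks, one block per 32-bit lane:
    w[j] = { block0[j], block1[j], block2[j], block3[j] }
  (each V4_LOAD_MSG_1 call transposes one 4x4 group of words).
*/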
1057
1058 #define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
1059 { \
1060 const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i ) * 4); \
1061 const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \
1062 d0 = _mm_unpacklo_epi32(v0, v1); \
1063 d1 = _mm_unpackhi_epi32(v0, v1); \
1064 }
1065
1066 #define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
1067 { \
1068 STORE_128_TO_STRUCT((dest32) + i * 4 , _mm_unpacklo_epi64(s0, s1)); \
1069 STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \
1070 }
1071
1072 #define V4_UNPACK_STATE(dest32, src32) \
1073 { \
1074 __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
1075 V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \
1076 V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \
1077 V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \
1078 V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \
1079 V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \
1080 V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \
1081 V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \
1082 V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \
1083 }
1084
1085
1086 static
1087 Z7_NO_INLINE
1088 #ifdef BLAKE2S_ATTRIB_128BIT
1089 BLAKE2S_ATTRIB_128BIT
1090 #endif
1091 void
1092 Z7_FASTCALL
1093 Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1094 {
1095 // PrintStates2(s_items, 8, 16);
1096 size_t pos = 0;
1097 pos /= 2;
1098 do
1099 {
1100 #if defined(Z7_BLAKE2S_USE_SSSE3) && \
1101 !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
1102 const __m128i r8 = k_r8;
1103 const __m128i r16 = k_r16;
1104 #endif
1105 __m128i w[16];
1106 __m128i v[16];
1107 UInt32 *s;
1108 V4_LOAD_MSG(w, data)
1109 s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1110 {
1111 __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
1112 D_ADD_EPI64_128 (ctr, k_inc);
1113 STORE_128_TO_STRUCT(s + 64, ctr);
1114 v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1115 v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1116 }
1117 v[ 8] = GET_128_IV_WAY4(0);
1118 v[ 9] = GET_128_IV_WAY4(1);
1119 v[10] = GET_128_IV_WAY4(2);
1120 v[11] = GET_128_IV_WAY4(3);
1121 v[14] = GET_128_IV_WAY4(6);
1122 v[15] = GET_128_IV_WAY4(7);
1123
1124 #define LOAD_STATE_128_FROM_STRUCT(i) \
1125 v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
1126
1127 #define UPDATE_STATE_128_IN_STRUCT(i) \
1128 STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
1129 XOR_128(v[i], v[(i) + 8]), \
1130 LOAD_128_FROM_STRUCT(s + (i) * 4)));
1131
1132 REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
1133 ROUNDS_LOOP (V4_ROUND)
1134 REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
1135
1136 data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1137 pos += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
1138 pos &= SUPER_BLOCK_SIZE / 2 - 1;
1139 }
1140 while (data != end);
1141 }
1142
1143
1144 static
1145 Z7_NO_INLINE
1146 #ifdef BLAKE2S_ATTRIB_128BIT
1147 BLAKE2S_ATTRIB_128BIT
1148 #endif
1149 void
1150 Z7_FASTCALL
1151 Blake2sp_Final_V128_Fast(UInt32 *states)
1152 {
1153 const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1154 // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
1155 // PrintStates2(states, 8, 16);
1156 {
1157 ptrdiff_t pos = 8 * 4;
1158 do
1159 {
1160 UInt32 *src32 = states + (size_t)(pos * 1);
1161 UInt32 *dest32 = states + (size_t)(pos * 2);
1162 V4_UNPACK_STATE(dest32, src32)
1163 pos -= 8 * 4;
1164 }
1165 while (pos >= 0);
1166 }
1167 {
1168 unsigned k;
1169 for (k = 0; k < 8; k++)
1170 {
1171 UInt32 *s = states + (size_t)k * 16;
1172 STORE_128_TO_STRUCT (STATE_T(s), ctr);
1173 }
1174 }
1175 // PrintStates2(states, 8, 16);
1176 }
1177
1178
1179
1180 #ifdef Z7_BLAKE2S_USE_AVX2
1181
1182 #define ADD_256(a, b) _mm256_add_epi32(a, b)
1183 #define XOR_256(a, b) _mm256_xor_si256(a, b)
1184
1185 #if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
1186 #define MM256_ROR_EPI32 _mm256_ror_epi32
1187 #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
1188 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1189 #define LOAD_ROTATE_CONSTS_256
1190 #endif
1191 #else
1192 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1193 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1194 #define LOAD_ROTATE_CONSTS_256 \
1195 const __m256i r8 = k_r8_256; \
1196 const __m256i r16 = k_r16_256;
1197 #endif // AVX2_WAY2
1198
1199 #define MM256_ROR_EPI32(r, c) ( \
1200 ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
1201 : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
1202 : _mm256_or_si256( \
1203 _mm256_srli_epi32((r), (c)), \
1204 _mm256_slli_epi32((r), 32-(c))))
1205 #endif // WAY_SLOW
1206 #endif
1207
1208
1209 #define D_ADD_256(dest, src) dest = ADD_256(dest, src)
1210 #define D_XOR_256(dest, src) dest = XOR_256(dest, src)
1211
1212 #define LOADU_256(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
1213
1214 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
1215
1216 #ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1217 #define ROT_256_16(x) MM256_ROR_EPI32((x), 16)
1218 #define ROT_256_12(x) MM256_ROR_EPI32((x), 12)
1219 #define ROT_256_8(x) MM256_ROR_EPI32((x), 8)
1220 #define ROT_256_7(x) MM256_ROR_EPI32((x), 7)
1221 #else
1222 #define ROTATE8 _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
1223 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
1224 #define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
1225 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
1226 #define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16)
1227 #define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
1228 #define ROT_256_8(x) _mm256_shuffle_epi8((x), ROTATE8)
1229 #define ROT_256_7(x) _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25))
1230 #endif
1231
1232 #define D_ROT_256_7(dest) dest = ROT_256_7(dest)
1233 #define D_ROT_256_8(dest) dest = ROT_256_8(dest)
1234 #define D_ROT_256_12(dest) dest = ROT_256_12(dest)
1235 #define D_ROT_256_16(dest) dest = ROT_256_16(dest)
1236
1237 #define LOAD_256(p) _mm256_load_si256((const __m256i *)(const void *)(p))
1238 #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
1239 #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r)
1240 #define LOAD_256_FROM_STRUCT(p) LOADU_256(p)
1241 #define STORE_256_TO_STRUCT(p, r) STOREU_256(p, r)
1242 #else
1243 // if struct is aligned for 32-bytes
1244 #define STORE_256(p, r) _mm256_store_si256((__m256i *)(void *)(p), r)
1245 #define LOAD_256_FROM_STRUCT(p) LOAD_256(p)
1246 #define STORE_256_TO_STRUCT(p, r) STORE_256(p, r)
1247 #endif
1248
1249 #endif // Z7_BLAKE2S_USE_AVX2_FAST
1250
1251
1252
1253 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1254
1255 #if 0
1256 #define DIAG_PERM2(s) \
1257 { \
1258 const __m256i a = LOAD_256_FROM_STRUCT((s) ); \
1259 const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
1260 STORE_256_TO_STRUCT((s ), _mm256_permute2x128_si256(a, b, 0x20)); \
1261 STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
1262 }
1263 #else
1264 #define DIAG_PERM2(s) \
1265 { \
1266 const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
1267 const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
1268 STORE_128_TO_STRUCT((s) + NSW, a); \
1269 STORE_128_TO_STRUCT((s) + 4 , b); \
1270 }
1271 #endif
1272 #define DIAG_PERM8(s_items) \
1273 { \
1274 DIAG_PERM2(s_items) \
1275 DIAG_PERM2(s_items + NSW * 2) \
1276 DIAG_PERM2(s_items + NSW * 4) \
1277 DIAG_PERM2(s_items + NSW * 6) \
1278 }
1279
1280
1281 #define AXR256(a, b, d, shift) \
1282 D_ADD_256(a, b); \
1283 D_XOR_256(d, a); \
1284 d = MM256_ROR_EPI32(d, shift); \
1285
1286
1287
1288 #ifdef Z7_BLAKE2S_USE_GATHER
1289
1290 #define TABLE_GATHER_256_4(a0,a1,a2,a3) \
1291 a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
1292 #define TABLE_GATHER_256( \
1293 a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
1294 { TABLE_GATHER_256_4(a0,a2,a4,a6), \
1295 TABLE_GATHER_256_4(a1,a3,a5,a7), \
1296 TABLE_GATHER_256_4(a8,a10,a12,a14), \
1297 TABLE_GATHER_256_4(a9,a11,a13,a15) }
1298 MY_ALIGN(64)
1299 static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
1300 { SIGMA_TABLE(TABLE_GATHER_256) };
1301 #define GET_SIGMA(r) \
1302 const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
1303 #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1304 const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
1305 #define SET_ROW_FROM_SIGMA_AVX(in) \
1306 _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
1307 #define SIGMA_INTERLEAVE 8
1308 #define SIGMA_HALF_ROW_SIZE 16
1309
1310 #else // !Z7_BLAKE2S_USE_GATHER
1311
1312 #define GET_SIGMA(r) \
1313 const Byte * const sigma = k_Blake2s_Sigma_4[r];
1314 #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1315 AXR2_LOAD_INDEXES(sigma_index)
1316 #define SET_ROW_FROM_SIGMA_AVX(in) \
1317 MY_mm256_set_m128i( \
1318 SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
1319 SET_ROW_FROM_SIGMA_W(in))
1320 #define SIGMA_INTERLEAVE 1
1321 #define SIGMA_HALF_ROW_SIZE 8
1322 #endif // !Z7_BLAKE2S_USE_GATHER
1323
1324
1325 #define ROTATE_WORDS_TO_RIGHT_256(a, n) \
1326 a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
1327
1328
1329
1330 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1331
1332 #define AXR2_A(sigma_index, shift1, shift2) \
1333 AXR2_LOAD_INDEXES_AVX(sigma_index) \
1334 D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1335 AXR256(a0, b0, d0, shift1) \
1336 AXR256(c0, d0, b0, shift2) \
1337
1338 #define AXR4_A(sigma_index) \
1339 { AXR2_A(sigma_index, 16, 12) } \
1340 { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1341
1342 #define EE1(r) \
1343 { GET_SIGMA(r) \
1344 AXR4_A(0) \
1345 ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1346 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1347 ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1348 AXR4_A(SIGMA_HALF_ROW_SIZE) \
1349 ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1350 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1351 ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1352 }
1353
1354 static
1355 Z7_NO_INLINE
1356 #ifdef BLAKE2S_ATTRIB_AVX2
1357 BLAKE2S_ATTRIB_AVX2
1358 #endif
1359 void
1360 Z7_FASTCALL
1361 Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
1362 {
1363 size_t pos = 0;
1364 end -= Z7_BLAKE2S_BLOCK_SIZE;
1365
1366 if (data != end)
1367 {
1368 LOAD_ROTATE_CONSTS_256
1369 DIAG_PERM8(s_items)
1370 do
1371 {
1372 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1373 __m256i a0, b0, c0, d0;
1374 {
1375 const __m128i inc = k_inc;
1376 __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1377 __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1378 D_ADD_EPI64_128(d0_128, inc);
1379 D_ADD_EPI64_128(d1_128, inc);
1380 STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
1381 STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
1382 d0 = MY_mm256_set_m128i(d1_128, d0_128);
1383 D_XOR_256(d0, k_iv4_256);
1384 }
1385 c0 = SET_FROM_128(k_iv0_128);
1386 a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1387 b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1388
1389 ROUNDS_LOOP (EE1)
1390
1391 D_XOR_256(a0, c0);
1392 D_XOR_256(b0, d0);
1393
1394 D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1395 D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1396
1397 STORE_256_TO_STRUCT(s + NSW * 0, a0);
1398 STORE_256_TO_STRUCT(s + NSW * 1, b0);
1399
1400 data += Z7_BLAKE2S_BLOCK_SIZE * 2;
1401 pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
1402 pos &= SUPER_BLOCK_MASK;
1403 }
1404 while (data < end);
1405 DIAG_PERM8(s_items)
1406 if (data != end)
1407 return;
1408 }
1409 {
1410 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1411 Z7_BLAKE2S_CompressSingleBlock(s, data);
1412 }
1413 }
1414
1415 #endif // Z7_BLAKE2S_USE_AVX2_WAY2
1416
1417
1418
1419 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
1420
1421 #define AXR2_X(sigma_index, shift1, shift2) \
1422 AXR2_LOAD_INDEXES_AVX(sigma_index) \
1423 D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1424 D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
1425 AXR256(a0, b0, d0, shift1) \
1426 AXR256(a1, b1, d1, shift1) \
1427 AXR256(c0, d0, b0, shift2) \
1428 AXR256(c1, d1, b1, shift2) \
1429
1430 #define AXR4_X(sigma_index) \
1431 { AXR2_X(sigma_index, 16, 12) } \
1432 { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1433
1434 #define EE2(r) \
1435 { GET_SIGMA(r) \
1436 AXR4_X(0) \
1437 ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1438 ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
1439 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1440 ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1441 ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1442 ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
1443 AXR4_X(SIGMA_HALF_ROW_SIZE) \
1444 ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1445 ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
1446 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1447 ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1448 ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1449 ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
1450 }
1451
1452 static
1453 Z7_NO_INLINE
1454 #ifdef BLAKE2S_ATTRIB_AVX2
1455 BLAKE2S_ATTRIB_AVX2
1456 #endif
1457 void
1458 Z7_FASTCALL
1459 Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
1460 {
1461 size_t pos = 0;
1462
1463 if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
1464 {
1465 #ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1466 const __m256i r8 = k_r8_256;
1467 const __m256i r16 = k_r16_256;
1468 #endif
1469 end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
1470 DIAG_PERM8(s_items)
1471 do
1472 {
1473 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1474 __m256i a0, b0, c0, d0;
1475 __m256i a1, b1, c1, d1;
1476 {
1477 const __m128i inc = k_inc;
1478 __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1479 __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1480 __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
1481 __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
1482 D_ADD_EPI64_128(d0_128, inc);
1483 D_ADD_EPI64_128(d1_128, inc);
1484 D_ADD_EPI64_128(d2_128, inc);
1485 D_ADD_EPI64_128(d3_128, inc);
1486 STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
1487 STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
1488 STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
1489 STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
1490 d0 = MY_mm256_set_m128i(d1_128, d0_128);
1491 d1 = MY_mm256_set_m128i(d3_128, d2_128);
1492 D_XOR_256(d0, k_iv4_256);
1493 D_XOR_256(d1, k_iv4_256);
1494 }
1495 c1 = c0 = SET_FROM_128(k_iv0_128);
1496 a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1497 b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1498 a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
1499 b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);
1500
1501 ROUNDS_LOOP (EE2)
1502
1503 D_XOR_256(a0, c0);
1504 D_XOR_256(b0, d0);
1505 D_XOR_256(a1, c1);
1506 D_XOR_256(b1, d1);
1507
1508 D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1509 D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1510 D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
1511 D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
1512
1513 STORE_256_TO_STRUCT(s + NSW * 0, a0);
1514 STORE_256_TO_STRUCT(s + NSW * 1, b0);
1515 STORE_256_TO_STRUCT(s + NSW * 2, a1);
1516 STORE_256_TO_STRUCT(s + NSW * 3, b1);
1517
1518 data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1519 pos += Z7_BLAKE2S_BLOCK_SIZE * 4;
1520 pos &= SUPER_BLOCK_MASK;
1521 }
1522 while (data < end);
1523 DIAG_PERM8(s_items)
1524 end += Z7_BLAKE2S_BLOCK_SIZE * 3;
1525 }
1526 if (data == end)
1527 return;
1528 // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
1529 do
1530 {
1531 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1532 Z7_BLAKE2S_CompressSingleBlock(s, data);
1533 data += Z7_BLAKE2S_BLOCK_SIZE;
1534 pos += Z7_BLAKE2S_BLOCK_SIZE;
1535 pos &= SUPER_BLOCK_MASK;
1536 }
1537 while (data != end);
1538 }
1539
1540 #endif // Z7_BLAKE2S_USE_AVX2_WAY4
1541 #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1542
1543
1544 // ---------------------------------------------------------
1545
1546 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
1547
1548 #define OP256_L(a, i) D_ADD_256 (V(a, 0), \
1549 LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));
1550
1551 #define OP256_0(a) OP256_L(a, 0)
1552 #define OP256_7(a) OP256_L(a, 1)
1553
1554 #define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1));
1555 #define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0));
1556 #define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3));
1557 #define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2));
1558
1559 #define OP256_3(a) D_ROT_256_16 (V(a, 3));
1560 #define OP256_6(a) D_ROT_256_12 (V(a, 1));
1561 #define OP256_8(a) D_ROT_256_8 (V(a, 3));
1562 #define OP256_9(a) D_ROT_256_7 (V(a, 1));
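/*
  The OP256_* macros mirror the OP_* sequence of the 128-bit fast path above,
  processing 8 parallel lanes per __m256i instead of 4.
*/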
1563
1564
1565 #if 0 || 1 && defined(MY_CPU_X86)
1566
1567 #define V8_G(a) \
1568 OP256_0 (a) \
1569 OP256_1 (a) \
1570 OP256_2 (a) \
1571 OP256_3 (a) \
1572 OP256_4 (a) \
1573 OP256_5 (a) \
1574 OP256_6 (a) \
1575 OP256_7 (a) \
1576 OP256_1 (a) \
1577 OP256_2 (a) \
1578 OP256_8 (a) \
1579 OP256_4 (a) \
1580 OP256_5 (a) \
1581 OP256_9 (a) \
1582
1583 #define V8R { \
1584 V8_G (0); \
1585 V8_G (1); \
1586 V8_G (2); \
1587 V8_G (3); \
1588 V8_G (4); \
1589 V8_G (5); \
1590 V8_G (6); \
1591 V8_G (7); \
1592 }
1593
1594 #else
1595
1596 #define OP256_INTER_4(op, a,b,c,d) \
1597 op (a) \
1598 op (b) \
1599 op (c) \
1600 op (d) \
1601
1602 #define V8_G(a,b,c,d) \
1603 OP256_INTER_4 (OP256_0, a,b,c,d) \
1604 OP256_INTER_4 (OP256_1, a,b,c,d) \
1605 OP256_INTER_4 (OP256_2, a,b,c,d) \
1606 OP256_INTER_4 (OP256_3, a,b,c,d) \
1607 OP256_INTER_4 (OP256_4, a,b,c,d) \
1608 OP256_INTER_4 (OP256_5, a,b,c,d) \
1609 OP256_INTER_4 (OP256_6, a,b,c,d) \
1610 OP256_INTER_4 (OP256_7, a,b,c,d) \
1611 OP256_INTER_4 (OP256_1, a,b,c,d) \
1612 OP256_INTER_4 (OP256_2, a,b,c,d) \
1613 OP256_INTER_4 (OP256_8, a,b,c,d) \
1614 OP256_INTER_4 (OP256_4, a,b,c,d) \
1615 OP256_INTER_4 (OP256_5, a,b,c,d) \
1616 OP256_INTER_4 (OP256_9, a,b,c,d) \
1617
1618 #define V8R { \
1619 V8_G (0, 1, 2, 3) \
1620 V8_G (4, 5, 6, 7) \
1621 }
1622 #endif
1623
1624 #define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R }
1625
1626
1627 // for debug:
1628 // #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
1629 #if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
1630 // gather instruction is slow.
1631 #define V8_LOAD_MSG(w, m) \
1632 { \
1633 unsigned i; \
1634 for (i = 0; i < 16; ++i) { \
1635 w[i] = _mm256_i32gather_epi32( \
1636 (const void *)((m) + i * sizeof(UInt32)),\
1637 _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
1638 sizeof(UInt32)); \
1639 } \
1640 }
1641 #else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1642
1643 #define V8_LOAD_MSG_2(w, a0, a1) \
1644 { \
1645 (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \
1646 (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \
1647 }
1648
1649 #define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
1650 { \
1651 __m256i s0, s1, s2, s3; \
1652 s0 = _mm256_unpacklo_epi64(z0, z1); \
1653 s1 = _mm256_unpackhi_epi64(z0, z1); \
1654 s2 = _mm256_unpacklo_epi64(z2, z3); \
1655 s3 = _mm256_unpackhi_epi64(z2, z3); \
1656 V8_LOAD_MSG_2((w) + 0, s0, s2) \
1657 V8_LOAD_MSG_2((w) + 1, s1, s3) \
1658 }
1659
1660 #define V8_LOAD_MSG_0(t0, t1, m) \
1661 { \
1662 __m256i m0, m1; \
1663 m0 = LOADU_256(m); \
1664 m1 = LOADU_256((m) + 2 * 32); \
1665 t0 = _mm256_unpacklo_epi32(m0, m1); \
1666 t1 = _mm256_unpackhi_epi32(m0, m1); \
1667 }
1668
1669 #define V8_LOAD_MSG_8(w, m) \
1670 { \
1671 __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
1672 V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \
1673 V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \
1674 V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \
1675 V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \
1676 V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \
1677 V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \
1678 }
1679
1680 #define V8_LOAD_MSG(w, m) \
1681 { \
1682 V8_LOAD_MSG_8(w, m) \
1683 V8_LOAD_MSG_8((w) + 8, (m) + 32) \
1684 }
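/*
  After V8_LOAD_MSG, w[j] (j = 0..15) holds message word j of the eight
  consecutive 64-byte blocks, one block per 32-bit lane of the __m256i:
    w[j] = { block0[j], block1[j], ..., block7[j] }
  i.e. one full BLAKE2sp super block is consumed per iteration.
*/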
1685
1686 #endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1687
1688
1689 #define V8_PERM_PAIR_STORE(u, a0, a2) \
1690 { \
1691 STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \
1692 STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \
1693 }
1694
1695 #define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
1696 { \
1697 __m256i s0, s1, s2, s3; \
1698 s0 = _mm256_unpacklo_epi64(z0, z1); \
1699 s1 = _mm256_unpackhi_epi64(z0, z1); \
1700 s2 = _mm256_unpacklo_epi64(z2, z3); \
1701 s3 = _mm256_unpackhi_epi64(z2, z3); \
1702 V8_PERM_PAIR_STORE(u + 0, s0, s2) \
1703 V8_PERM_PAIR_STORE(u + 2, s1, s3) \
1704 }
1705
1706 #define V8_UNPACK_STORE_0(src32, d0, d1) \
1707 { \
1708 const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \
1709 const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \
1710 d0 = _mm256_unpacklo_epi32(v0, v1); \
1711 d1 = _mm256_unpackhi_epi32(v0, v1); \
1712 }
1713
1714 #define V8_UNPACK_STATE(dest32, src32) \
1715 { \
1716 __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
1717 V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \
1718 V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \
1719 V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \
1720 V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \
1721 V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \
1722 V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \
1723 }
1724
1725
1726
1727 #define V8_LOAD_STATE_256_FROM_STRUCT(i) \
1728 v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);
1729
1730 #if 0 || 0 && defined(MY_CPU_X86)
1731 #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1732 #endif
1733
1734 #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1735 // this branch doesn't use the (iv) array,
1736 // so register pressure can be lower.
1737 // it can be faster sometimes.
1738 #define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i)
1739 #define V8_UPDATE_STATE_256(i) \
1740 { \
1741 STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
1742 XOR_256(v[i], v[(i) + 8]), \
1743 LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
1744 }
1745 #else
1746 // this branch uses more variables ((iv) registers);
1747 // it's better for gcc.
1748 // maybe this branch is better, if register pressure becomes lower (avx512).
1749 #define V8_LOAD_STATE_256(i) { iv[i] = v[i]; }
1750 #define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
1751 #define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
1752 #endif
1753
1754
1755 #if 0
1756 // use loading constants from memory
1757 #define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
1758 MY_ALIGN(64)
1759 static const UInt32 k_Blake2s_IV_WAY8[]=
1760 {
1761 KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
1762 };
1763 #define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
1764 #else
1765 // use constant generation:
1766 #define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i))
1767 #endif
1768
1769
1770 static
1771 Z7_NO_INLINE
1772 #ifdef BLAKE2S_ATTRIB_AVX2
1773 BLAKE2S_ATTRIB_AVX2
1774 #endif
1775 void
1776 Z7_FASTCALL
1777 Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1778 {
1779 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1780 __m256i v[16];
1781 #endif
1782
1783 // PrintStates2(s_items, 8, 16);
1784
1785 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1786 REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
1787 #endif
1788
1789 do
1790 {
1791 __m256i w[16];
1792 #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1793 __m256i v[16];
1794 #else
1795 __m256i iv[8];
1796 #endif
1797 V8_LOAD_MSG(w, data)
1798 {
1799 // we load/store ctr inside the loop to reduce register pressure:
1800 #if 1 || 1 && defined(MY_CPU_X86)
1801 const __m256i ctr = _mm256_add_epi64(
1802 LOAD_256_FROM_STRUCT(s_items + 64),
1803 _mm256_set_epi32(
1804 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
1805 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
1806 STORE_256_TO_STRUCT(s_items + 64, ctr);
1807 #else
1808 const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
1809 + Z7_BLAKE2S_BLOCK_SIZE;
1810 const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
1811 *(UInt64 *)(void *)(s_items + 64) = ctr64;
1812 #endif
1813 v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1814 v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1815 }
1816 v[ 8] = GET_256_IV_WAY8(0);
1817 v[ 9] = GET_256_IV_WAY8(1);
1818 v[10] = GET_256_IV_WAY8(2);
1819 v[11] = GET_256_IV_WAY8(3);
1820 v[14] = GET_256_IV_WAY8(6);
1821 v[15] = GET_256_IV_WAY8(7);
1822
1823 REP8_MACRO (V8_LOAD_STATE_256)
1824 ROUNDS_LOOP (V8_ROUND)
1825 REP8_MACRO (V8_UPDATE_STATE_256)
1826 data += SUPER_BLOCK_SIZE;
1827 }
1828 while (data != end);
1829
1830 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1831 REP8_MACRO (V8_STORE_STATE_256)
1832 #endif
1833 }
1834
1835
1836 static
1837 Z7_NO_INLINE
1838 #ifdef BLAKE2S_ATTRIB_AVX2
1839 BLAKE2S_ATTRIB_AVX2
1840 #endif
1841 void
1842 Z7_FASTCALL
1843 Blake2sp_Final_AVX2_Fast(UInt32 *states)
1844 {
1845 const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1846 // PrintStates2(states, 8, 16);
1847 V8_UNPACK_STATE(states, states)
1848 // PrintStates2(states, 8, 16);
1849 {
1850 unsigned k;
1851 for (k = 0; k < 8; k++)
1852 {
1853 UInt32 *s = states + (size_t)k * 16;
1854 STORE_128_TO_STRUCT (STATE_T(s), ctr);
1855 }
1856 }
1857 // PrintStates2(states, 8, 16);
1858 // printf("\nafter V8_UNPACK_STATE \n");
1859 }
1860
1861 #endif // Z7_BLAKE2S_USE_AVX2_FAST
1862 #endif // avx2
1863 #endif // vector
1864
1865
1866 /*
1867 #define Blake2s_Increment_Counter(s, inc) \
1868 { STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
1869 #define Blake2s_Increment_Counter_Small(s, inc) \
1870 { STATE_T(s)[0] += (inc); }
1871 */
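/*
Note: in the commented macro above, (STATE_T(s)[0] < (inc)) detects overflow
of the low 32-bit counter word, so the pair STATE_T(s)[0..1] behaves as one
64-bit byte counter; the same carry logic is inlined in Blake2s_Compress below.
*/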
1872
1873 #define Blake2s_Set_LastBlock(s) \
1874 { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
1875
1876
1877 #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
1878 // good for vs2022
1879 #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
1880 #else
1881 // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
1882 #define LOOP_8(mac) { REP8_MACRO(mac) }
1883 #endif
1884
1885
1886 static
1887 Z7_FORCE_INLINE
1888 // Z7_NO_INLINE
1889 void
1890 Z7_FASTCALL
1891 Blake2s_Compress(UInt32 *s, const Byte *input)
1892 {
1893 UInt32 m[16];
1894 UInt32 v[16];
1895 {
1896 unsigned i;
1897 for (i = 0; i < 16; i++)
1898 m[i] = GetUi32(input + i * 4);
1899 }
1900
1901 #define INIT_v_FROM_s(i) v[i] = s[i];
1902
1903 LOOP_8(INIT_v_FROM_s)
1904
1905 // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
1906 {
1907 const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
1908 const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
1909 STATE_T(s)[0] = t0;
1910 STATE_T(s)[1] = t1;
1911 v[12] = t0 ^ KIV(4);
1912 v[13] = t1 ^ KIV(5);
1913 }
1914 // v[12] = STATE_T(s)[0] ^ KIV(4);
1915 // v[13] = STATE_T(s)[1] ^ KIV(5);
1916 v[14] = STATE_F(s)[0] ^ KIV(6);
1917 v[15] = STATE_F(s)[1] ^ KIV(7);
1918
1919 v[ 8] = KIV(0);
1920 v[ 9] = KIV(1);
1921 v[10] = KIV(2);
1922 v[11] = KIV(3);
1923 // PrintStates2((const UInt32 *)v, 1, 16);
1924
1925 #define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
1926 #define ADD32M(dest, src, a) V(a, dest) += V(a, src);
1927 #define XOR32M(dest, src, a) V(a, dest) ^= V(a, src);
1928 #define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift);
1929
1930 // big interleaving can provide a big performance gain, if scheduler queues are small (a plain-C reference G() is sketched after this #if/#elif/#else selection).
1931 #if 0 || 1 && defined(MY_CPU_X86)
1932 // interleave-1: for small register number (x86-32bit)
1933 #define G2(index, a, x, y) \
1934 ADD_SIGMA (a, (index) + 2 * 0) \
1935 ADD32M (0, 1, a) \
1936 XOR32M (3, 0, a) \
1937 RTR32M (3, x, a) \
1938 ADD32M (2, 3, a) \
1939 XOR32M (1, 2, a) \
1940 RTR32M (1, y, a) \
1941
1942 #define G(a) \
1943 G2(a * 2 , a, 16, 12) \
1944 G2(a * 2 + 1, a, 8, 7) \
1945
1946 #define R2 \
1947 G(0) \
1948 G(1) \
1949 G(2) \
1950 G(3) \
1951 G(4) \
1952 G(5) \
1953 G(6) \
1954 G(7) \
1955
1956 #elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
1957 // interleave-2: good if the number of registers is not large (x86-64).
1958
1959 #define REP2(mac, dest, src, a, b) \
1960 mac(dest, src, a) \
1961 mac(dest, src, b)
1962
1963 #define G2(index, a, b, x, y) \
1964 ADD_SIGMA (a, (index) + 2 * 0) \
1965 ADD_SIGMA (b, (index) + 2 * 1) \
1966 REP2 (ADD32M, 0, 1, a, b) \
1967 REP2 (XOR32M, 3, 0, a, b) \
1968 REP2 (RTR32M, 3, x, a, b) \
1969 REP2 (ADD32M, 2, 3, a, b) \
1970 REP2 (XOR32M, 1, 2, a, b) \
1971 REP2 (RTR32M, 1, y, a, b) \
1972
1973 #define G(a, b) \
1974 G2(a * 2 , a, b, 16, 12) \
1975 G2(a * 2 + 1, a, b, 8, 7) \
1976
1977 #define R2 \
1978 G(0, 1) \
1979 G(2, 3) \
1980 G(4, 5) \
1981 G(6, 7) \
1982
1983 #else
1984 // interleave-4:
1985 // it creates high register pressure on x86/x64,
1986 // and MSVC compilers for x86/x64 are slow for this branch.
1987 // but if many registers are available, this branch can be faster.
1988
1989 #define REP4(mac, dest, src, a, b, c, d) \
1990 mac(dest, src, a) \
1991 mac(dest, src, b) \
1992 mac(dest, src, c) \
1993 mac(dest, src, d)
1994
1995 #define G2(index, a, b, c, d, x, y) \
1996 ADD_SIGMA (a, (index) + 2 * 0) \
1997 ADD_SIGMA (b, (index) + 2 * 1) \
1998 ADD_SIGMA (c, (index) + 2 * 2) \
1999 ADD_SIGMA (d, (index) + 2 * 3) \
2000 REP4 (ADD32M, 0, 1, a, b, c, d) \
2001 REP4 (XOR32M, 3, 0, a, b, c, d) \
2002 REP4 (RTR32M, 3, x, a, b, c, d) \
2003 REP4 (ADD32M, 2, 3, a, b, c, d) \
2004 REP4 (XOR32M, 1, 2, a, b, c, d) \
2005 REP4 (RTR32M, 1, y, a, b, c, d) \
2006
2007 #define G(a, b, c, d) \
2008 G2(a * 2 , a, b, c, d, 16, 12) \
2009 G2(a * 2 + 1, a, b, c, d, 8, 7) \
2010
2011 #define R2 \
2012 G(0, 1, 2, 3) \
2013 G(4, 5, 6, 7) \
2014
2015 #endif
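/*
Reference (non-interleaved) form of the BLAKE2s G function that the
ADD_SIGMA / ADD32M / XOR32M / RTR32M macro chains above implement column-wise.
This is only an illustrative sketch (the helper name and pointer parameters
are not part of this file):

  static void Blake2s_G_Ref(UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d,
      UInt32 mx, UInt32 my)
  {
    *a += *b + mx;  *d = rotrFixed(*d ^ *a, 16);
    *c += *d;       *b = rotrFixed(*b ^ *c, 12);
    *a += *b + my;  *d = rotrFixed(*d ^ *a, 8);
    *c += *d;       *b = rotrFixed(*b ^ *c, 7);
  }
*/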
2016
2017 #define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 }
2018
2019 // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
2020 // 20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
2021 // 30-60% faster for (arm64-arm32) GCC.
2022 // 5-11% faster for (arm64) CLANG-MAC.
2023 // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
2024 // But if there is a vector branch (for x86*), this scalar code will mostly be unused.
2025 // So we want smaller code (without unrolling) in that case (x86*).
2026 #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
2027 #define Z7_BLAKE2S_UNROLL
2028 #endif
2029
2030 #ifdef Z7_BLAKE2S_UNROLL
2031 ROUNDS_LOOP_UNROLLED (R)
2032 #else
2033 ROUNDS_LOOP (R)
2034 #endif
2035
2036 #undef G
2037 #undef G2
2038 #undef R
2039 #undef R2
2040
2041 // printf("\n v after: \n");
2042 // PrintStates2((const UInt32 *)v, 1, 16);
2043 #define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8];
2044
2045 LOOP_8(XOR_s_PAIR_v)
2046 // printf("\n s after:\n");
2047 // PrintStates2((const UInt32 *)s, 1, 16);
2048 }
2049
2050
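/*
Scalar fallback: compresses the input block by block, cycling through the
8 interleaved lane states of BLAKE2sp via (pos & SUPER_BLOCK_MASK), so each
consecutive Z7_BLAKE2S_BLOCK_SIZE chunk goes to the next lane.
*/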
2051 static
2052 Z7_NO_INLINE
2053 void
2054 Z7_FASTCALL
2055 Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
2056 {
2057 size_t pos = 0;
2058 // PrintStates2(s_items, 8, 16);
2059 do
2060 {
2061 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
2062 Blake2s_Compress(s, data);
2063 data += Z7_BLAKE2S_BLOCK_SIZE;
2064 pos += Z7_BLAKE2S_BLOCK_SIZE;
2065 pos &= SUPER_BLOCK_MASK;
2066 }
2067 while (data != end);
2068 }
2069
2070
2071 #ifdef Z7_BLAKE2S_USE_VECTORS
2072
2073 static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2;
2074 static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
2075 static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init;
2076 static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final;
2077 static unsigned g_z7_Blake2sp_SupportedFlags;
2078
2079 #define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast
2080 #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single
2081 #else
2082 #define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2
2083 #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2
2084 #endif // Z7_BLAKE2S_USE_VECTORS
2085
2086
2087 #if 1 && defined(MY_CPU_LE)
2088 #define GET_DIGEST(_s, _digest) \
2089 { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
2090 #else
2091 #define GET_DIGEST(_s, _digest) \
2092 { unsigned _i; for (_i = 0; _i < 8; _i++) \
2093 { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
2094 }
2095 #endif
2096
2097
2098 /* ---------- BLAKE2s ---------- */
2099 /*
2100 // we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0()
2101 typedef struct
2102 {
2103 Byte digest_length;
2104 Byte key_length;
2105 Byte fanout; // = 1 : in sequential mode
2106 Byte depth; // = 1 : in sequential mode
2107 UInt32 leaf_length;
2108 Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode
2109 Byte node_depth; // 0 for the leaves, or in sequential mode
2110 Byte inner_length; // [0, 32], 0 in sequential mode
2111 Byte salt[BLAKE2S_SALTBYTES];
2112 Byte personal[BLAKE2S_PERSONALBYTES];
2113 } CBlake2sParam;
2114 */
2115
2116 #define k_Blake2sp_IV_0 \
2117 (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
2118 #define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \
2119 (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))
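/*
These two defines fold the BLAKE2s parameter block (see CBlake2sParam above)
into the IV: k_Blake2sp_IV_0 mixes digest_length, fanout and depth into h[0],
and k_Blake2sp_IV_3_FROM_NODE_DEPTH mixes node_depth and inner_length
(== Z7_BLAKE2S_DIGEST_SIZE) into h[3]; node_offset is xored into h[2] in
Blake2sp_Init_Spec() below.
*/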
2120
2121 Z7_FORCE_INLINE
2122 static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
2123 {
2124 s[0] = k_Blake2sp_IV_0;
2125 s[1] = KIV(1);
2126 s[2] = KIV(2) ^ (UInt32)node_offset;
2127 s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
2128 s[4] = KIV(4);
2129 s[5] = KIV(5);
2130 s[6] = KIV(6);
2131 s[7] = KIV(7);
2132
2133 STATE_T(s)[0] = 0;
2134 STATE_T(s)[1] = 0;
2135 STATE_F(s)[0] = 0;
2136 STATE_F(s)[1] = 0;
2137 }
2138
2139
2140 #ifdef Z7_BLAKE2S_USE_V128_FAST
2141
2142 static
2143 Z7_NO_INLINE
2144 #ifdef BLAKE2S_ATTRIB_128BIT
2145 BLAKE2S_ATTRIB_128BIT
2146 #endif
2147 void
2148 Z7_FASTCALL
2149 Blake2sp_InitState_V128_Fast(UInt32 *states)
2150 {
2151 #define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
2152 { STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \
2153 STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
2154 }
2155 #define STORE_128_PAIR_INIT_STATES_1(i, mac) \
2156 { const __m128i t = mac; \
2157 STORE_128_PAIR_INIT_STATES_2(i, t, t) \
2158 }
2159 #define STORE_128_PAIR_INIT_STATES_IV(i) \
2160 STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))
2161
2162 STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
2163 STORE_128_PAIR_INIT_STATES_IV (1)
2164 {
2165 const __m128i t = GET_128_IV_WAY4(2);
2166 STORE_128_PAIR_INIT_STATES_2 (2,
2167 XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
2168 XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
2169 }
2170 STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2171 STORE_128_PAIR_INIT_STATES_IV (4)
2172 STORE_128_PAIR_INIT_STATES_IV (5)
2173 STORE_128_PAIR_INIT_STATES_IV (6)
2174 STORE_128_PAIR_INIT_STATES_IV (7)
2175 STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0))
2176 // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
2177 }
2178
2179 #endif // Z7_BLAKE2S_USE_V128_FAST
2180
2181
2182 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2183
2184 static
2185 Z7_NO_INLINE
2186 #ifdef BLAKE2S_ATTRIB_AVX2
2187 BLAKE2S_ATTRIB_AVX2
2188 #endif
2189 void
2190 Z7_FASTCALL
2191 Blake2sp_InitState_AVX2_Fast(UInt32 *states)
2192 {
2193 #define STORE_256_INIT_STATES(i, t) \
2194 STORE_256_TO_STRUCT(states + 8 * (i), t);
2195 #define STORE_256_INIT_STATES_IV(i) \
2196 STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))
2197
2198 STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
2199 STORE_256_INIT_STATES_IV (1)
2200 STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2),
2201 _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
2202 STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2203 STORE_256_INIT_STATES_IV (4)
2204 STORE_256_INIT_STATES_IV (5)
2205 STORE_256_INIT_STATES_IV (6)
2206 STORE_256_INIT_STATES_IV (7)
2207 STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
2208 // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
2209 }
2210
2211 #endif // Z7_BLAKE2S_USE_AVX2_FAST
2212
2213
2214
2215 Z7_NO_INLINE
2216 void Blake2sp_InitState(CBlake2sp *p)
2217 {
2218 size_t i;
2219 // memset(p->states, 0, sizeof(p->states)); // for debug
2220 p->u.header.cycPos = 0;
2221 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2222 if (p->u.header.func_Init)
2223 {
2224 p->u.header.func_Init(p->states);
2225 return;
2226 }
2227 #endif
2228 for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
2229 Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
2230 }
2231
2232 void Blake2sp_Init(CBlake2sp *p)
2233 {
2234 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2235 p->u.header.func_Compress_Fast =
2236 #ifdef Z7_BLAKE2S_USE_VECTORS
2237 g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2238 #else
2239 NULL;
2240 #endif
2241
2242 p->u.header.func_Compress_Single =
2243 #ifdef Z7_BLAKE2S_USE_VECTORS
2244 g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2245 #else
2246 NULL;
2247 #endif
2248
2249 p->u.header.func_Init =
2250 #ifdef Z7_BLAKE2S_USE_VECTORS
2251 g_Z7_BLAKE2SP_FUNC_INIT_Init;
2252 #else
2253 NULL;
2254 #endif
2255
2256 p->u.header.func_Final =
2257 #ifdef Z7_BLAKE2S_USE_VECTORS
2258 g_Z7_BLAKE2SP_FUNC_INIT_Final;
2259 #else
2260 NULL;
2261 #endif
2262 #endif
2263
2264 Blake2sp_InitState(p);
2265 }
2266
2267
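/*
Blake2sp_Update buffers input in p->buf32 (two superblocks) and compresses
only whole superblocks via func_Compress_Fast; whatever does not fill a
compressible superblock stays buffered (cycPos < SUPER_BLOCK_SIZE * 2), so
Blake2sp_Final can apply the last-block processing.
*/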
2268 void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
2269 {
2270 size_t pos;
2271 // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
2272 if (size == 0)
2273 return;
2274 pos = p->u.header.cycPos;
2275 // pos < SUPER_BLOCK_SIZE * 2 : is expected
2276 // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is supported also
2277 {
2278 const size_t pos2 = pos & SUPER_BLOCK_MASK;
2279 if (pos2)
2280 {
2281 const size_t rem = SUPER_BLOCK_SIZE - pos2;
2282 if (rem > size)
2283 {
2284 p->u.header.cycPos = (unsigned)(pos + size);
2285 // cycPos < SUPER_BLOCK_SIZE * 2
2286 memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2287 /* to simplify the code here we don't try to process the first superblock,
2288 if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
2289 return;
2290 }
2291 // (rem <= size)
2292 memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
2293 pos += rem;
2294 data += rem;
2295 size -= rem;
2296 }
2297 }
2298
2299 // pos <= SUPER_BLOCK_SIZE * 2
2300 // pos % SUPER_BLOCK_SIZE == 0
2301 if (pos)
2302 {
2303 /* pos == SUPER_BLOCK_SIZE ||
2304 pos == SUPER_BLOCK_SIZE * 2 */
2305 size_t end = pos;
2306 if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
2307 || (end -= SUPER_BLOCK_SIZE))
2308 {
2309 Z7_BLAKE2SP_Compress_Fast(p)(p->states,
2310 (const Byte *)(const void *)p->buf32,
2311 (const Byte *)(const void *)p->buf32 + end);
2312 if (pos -= end)
2313 memcpy(p->buf32, (const Byte *)(const void *)p->buf32
2314 + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
2315 }
2316 }
2317
2318 // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
2319 if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2320 {
2321 // pos == 0
2322 const Byte *end;
2323 const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
2324 & ~(size_t)SUPER_BLOCK_MASK;
2325 size -= size2;
2326 // size < SUPER_BLOCK_SIZE * 2
2327 end = data + size2;
2328 Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
2329 data = end;
2330 }
2331
2332 if (size != 0)
2333 {
2334 memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2335 pos += size;
2336 }
2337 p->u.header.cycPos = (unsigned)pos;
2338 // cycPos < SUPER_BLOCK_SIZE * 2
2339 }
2340
2341
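/*
Blake2sp_Final: pads and compresses the buffered tail through the 8 leaf
states (setting last-block flags and correcting the byte counters), collects
the 8 leaf digests into buf32, and then hashes them with a root BLAKE2s
instance (Blake2sp_Init_Spec(..., node_depth = 1)) to get the final digest.
*/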
2342 void Blake2sp_Final(CBlake2sp *p, Byte *digest)
2343 {
2344 // UInt32 * const R_states = p->states;
2345 // printf("\nBlake2sp_Final \n");
2346 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2347 if (p->u.header.func_Final)
2348 p->u.header.func_Final(p->states);
2349 #endif
2350 // printf("\n=====\nBlake2sp_Final \n");
2351 // PrintStates(p->states, 32);
2352
2353 // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
2354 if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
2355 {
2356 unsigned pos;
2357 memset((Byte *)(void *)p->buf32 + p->u.header.cycPos,
2358 0, SUPER_BLOCK_SIZE - p->u.header.cycPos);
2359 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2360 for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2361 {
2362 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2363 Blake2s_Set_LastBlock(s)
2364 if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
2365 {
2366 UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
2367 if (pos < p->u.header.cycPos)
2368 delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
2369 // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
2370 {
2371 const UInt32 v = STATE_T(s)[0];
2372 STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0)
2373 STATE_T(s)[0] = v - delta;
2374 }
2375 }
2376 }
2377 // PrintStates(p->states, 16);
2378 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2379 (Byte *)(void *)p->buf32,
2380 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2381 // PrintStates(p->states, 16);
2382 }
2383 else
2384 {
2385 // (p->u.header.cycPos > SUPER_BLOCK_SIZE)
2386 unsigned pos;
2387 for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2388 {
2389 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2390 if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
2391 Blake2s_Set_LastBlock(s)
2392 }
2393 if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2394 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2395
2396 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2397 (Byte *)(void *)p->buf32,
2398 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2399
2400 // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2401 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2402
2403 // if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
2404 {
2405 pos = SUPER_BLOCK_SIZE;
2406 for (;;)
2407 {
2408 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
2409 Blake2s_Set_LastBlock(s)
2410 pos += Z7_BLAKE2S_BLOCK_SIZE;
2411 if (pos >= p->u.header.cycPos)
2412 {
2413 if (pos != p->u.header.cycPos)
2414 {
2415 const UInt32 delta = pos - p->u.header.cycPos;
2416 const UInt32 v = STATE_T(s)[0];
2417 STATE_T(s)[1] -= v < delta;
2418 STATE_T(s)[0] = v - delta;
2419 memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
2420 }
2421 break;
2422 }
2423 }
2424 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2425 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
2426 (Byte *)(void *)p->buf32 + pos);
2427 }
2428 }
2429
2430 {
2431 size_t pos;
2432 for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
2433 {
2434 const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
2435 Byte *dest = (Byte *)(void *)p->buf32 + pos;
2436 GET_DIGEST(s, dest)
2437 }
2438 }
2439 Blake2sp_Init_Spec(p->states, 0, 1);
2440 {
2441 size_t pos;
2442 for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
2443 - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2444 {
2445 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2446 (const Byte *)(const void *)p->buf32 + pos,
2447 (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
2448 }
2449 }
2450 // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
2451 Blake2s_Set_LastBlock(p->states)
2452 STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
2453 {
2454 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2455 (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
2456 (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
2457 }
2458 GET_DIGEST(p->states, digest)
2459 // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
2460 }
2461
2462
2463 BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
2464 {
2465 // printf("\n========== setfunction = %d ======== \n", algo);
2466 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2467 Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
2468 Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
2469 Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2470 Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2471 #else
2472 UNUSED_VAR(p)
2473 #endif
2474
2475 #ifdef Z7_BLAKE2S_USE_VECTORS
2476
2477 func = func_Single = Blake2sp_Compress2;
2478
2479 if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
2480 {
2481 // printf("\n========== setfunction NON-SCALAR ======== \n");
2482 if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
2483 {
2484 func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2485 func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2486 func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init;
2487 func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final;
2488 }
2489 else
2490 {
2491 if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
2492 return False;
2493
2494 #ifdef Z7_BLAKE2S_USE_AVX2
2495
2496 func_Single =
2497 #if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2498 Blake2sp_Compress2_AVX2_Way2;
2499 #else
2500 Z7_BLAKE2S_Compress2_V128;
2501 #endif
2502
2503 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2504 if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
2505 {
2506 func = Blake2sp_Compress2_AVX2_Fast;
2507 func_Final = Blake2sp_Final_AVX2_Fast;
2508 func_Init = Blake2sp_InitState_AVX2_Fast;
2509 }
2510 else
2511 #endif
2512 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2513 if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
2514 func = Blake2sp_Compress2_AVX2_Way2;
2515 else
2516 #endif
2517 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2518 if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
2519 {
2520 func_Single = func = Blake2sp_Compress2_AVX2_Way4;
2521 }
2522 else
2523 #endif
2524 #endif // avx2
2525 {
2526 if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
2527 {
2528 func = Blake2sp_Compress2_V128_Fast;
2529 func_Final = Blake2sp_Final_V128_Fast;
2530 func_Init = Blake2sp_InitState_V128_Fast;
2531 func_Single = Z7_BLAKE2S_Compress2_V128;
2532 }
2533 else
2534 #ifdef Z7_BLAKE2S_USE_V128_WAY2
2535 if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
2536 func = func_Single = Blake2sp_Compress2_V128_Way2;
2537 else
2538 #endif
2539 {
2540 if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
2541 return False;
2542 func = func_Single = Blake2sp_Compress2_V128_Way1;
2543 }
2544 }
2545 }
2546 }
2547 #else // !VECTORS
2548 if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
2549 return False;
2550 #endif // !VECTORS
2551
2552 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2553 p->u.header.func_Compress_Fast = func;
2554 p->u.header.func_Compress_Single = func_Single;
2555 p->u.header.func_Final = func_Final;
2556 p->u.header.func_Init = func_Init;
2557 #endif
2558 // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
2559 return True;
2560 }
2561
2562
2563 void z7_Black2sp_Prepare(void)
2564 {
2565 #ifdef Z7_BLAKE2S_USE_VECTORS
2566 unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);
2567
2568 Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2;
2569 Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
2570 Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2571 Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2572
2573 #if defined(MY_CPU_X86_OR_AMD64)
2574 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2575 // optional check
2576 #if 0 || !(defined(__AVX512F__) && defined(__AVX512VL__))
2577 if (CPU_IsSupported_AVX512F_AVX512VL())
2578 #endif
2579 #elif defined(Z7_BLAKE2S_USE_SSE41)
2580 if (CPU_IsSupported_SSE41())
2581 #elif defined(Z7_BLAKE2S_USE_SSSE3)
2582 if (CPU_IsSupported_SSSE3())
2583 #elif !defined(MY_CPU_AMD64)
2584 if (CPU_IsSupported_SSE2())
2585 #endif
2586 #endif
2587 {
2588 #if defined(Z7_BLAKE2S_USE_SSE41)
2589 // printf("\n========== Blake2s SSE41 128-bit\n");
2590 #elif defined(Z7_BLAKE2S_USE_SSSE3)
2591 // printf("\n========== Blake2s SSSE3 128-bit\n");
2592 #else
2593 // printf("\n========== Blake2s SSE2 128-bit\n");
2594 #endif
2595 // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
2596 // printf("\n========== Blake2sp_Compress2_V128_Way2\n");
2597 func_Fast =
2598 func_Single = Z7_BLAKE2S_Compress2_V128;
2599 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
2600 #ifdef Z7_BLAKE2S_USE_V128_WAY2
2601 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
2602 #endif
2603 #ifdef Z7_BLAKE2S_USE_V128_FAST
2604 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
2605 func_Fast = Blake2sp_Compress2_V128_Fast;
2606 func_Init = Blake2sp_InitState_V128_Fast;
2607 func_Final = Blake2sp_Final_V128_Fast;
2608 #endif
2609
2610 #ifdef Z7_BLAKE2S_USE_AVX2
2611 #if defined(MY_CPU_X86_OR_AMD64)
2612
2613 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2614 #if 0
2615 if (CPU_IsSupported_AVX512F_AVX512VL())
2616 #endif
2617 #else
2618 if (CPU_IsSupported_AVX2())
2619 #endif
2620 #endif
2621 {
2622 // #pragma message ("=== Blake2s AVX2")
2623 // printf("\n========== Blake2s AVX2\n");
2624
2625 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2626 func_Single = Blake2sp_Compress2_AVX2_Way2;
2627 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
2628 #endif
2629 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2630 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
2631 #endif
2632
2633 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2634 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
2635 func_Fast = Blake2sp_Compress2_AVX2_Fast;
2636 func_Init = Blake2sp_InitState_AVX2_Fast;
2637 func_Final = Blake2sp_Final_AVX2_Fast;
2638 #elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
2639 func_Fast = Blake2sp_Compress2_AVX2_Way4;
2640 #elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2641 func_Fast = Blake2sp_Compress2_AVX2_Way2;
2642 #endif
2643 } // avx2
2644 #endif // avx2
2645 } // sse*
2646 g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast;
2647 g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
2648 g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init;
2649 g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final;
2650 g_z7_Blake2sp_SupportedFlags = flags;
2651 // printf("\nflags=%x\n", flags);
2652 #endif // vectors
2653 }
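/*
Typical usage of this module (a minimal sketch; data/dataSize are illustrative):

  CBlake2sp ctx;
  Byte digest[Z7_BLAKE2S_DIGEST_SIZE];
  z7_Black2sp_Prepare();            // select compress/init/final functions once
  Blake2sp_Init(&ctx);
  Blake2sp_Update(&ctx, data, dataSize);
  Blake2sp_Final(&ctx, digest);
*/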
2654
2655 /*
2656 #ifdef Z7_BLAKE2S_USE_VECTORS
2657 void align_test2(CBlake2sp *sp);
2658 void align_test2(CBlake2sp *sp)
2659 {
2660 __m128i a = LOAD_128(sp->states);
2661 D_XOR_128(a, LOAD_128(sp->states + 4));
2662 STORE_128(sp->states, a);
2663 }
2664 void align_test2(void);
2665 void align_test2(void)
2666 {
2667 CBlake2sp sp;
2668 Blake2sp_Init(&sp);
2669 Blake2sp_Update(&sp, NULL, 0);
2670 }
2671 #endif
2672 */
2673